From d5388e23b5c75bf8189b173051d24a0176a5a303 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Fri, 30 Oct 2020 18:46:23 +0000 Subject: runtime: improve memmove performance on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the memmove implementation for moves of 17 bytes or larger with an implementation from ARM optimized software. The moves of 16 bytes or fewer are unchanged, but the registers used are updated to match the rest of the implementation. This implementation makes use of new optimizations: - software pipelined loop for large (>128 byte) moves - medium size moves (17..128 bytes) have a new implementation - address realignment when src or dst is unaligned - preference for aligned src (loads) or dst (stores) depending on CPU To support preference for aligned loads or aligned stores, a new CPU flag is added. This flag indicates that the detected micro architecture performs better with aligned loads. Some tested CPUs did not exhibit a significant difference and are left with the default behavior of realigning based on the destination address (stores). Neoverse N1 (Tested on Graviton 2) name old time/op new time/op delta Memmove/0-4 1.88ns ± 1% 1.87ns ± 1% -0.58% (p=0.020 n=10+10) Memmove/1-4 4.40ns ± 0% 4.40ns ± 0% ~ (all equal) Memmove/8-4 3.88ns ± 3% 3.80ns ± 0% -1.97% (p=0.001 n=10+9) Memmove/16-4 3.90ns ± 3% 3.80ns ± 0% -2.49% (p=0.000 n=10+9) Memmove/32-4 4.80ns ± 0% 4.40ns ± 0% -8.33% (p=0.000 n=9+8) Memmove/64-4 5.86ns ± 0% 5.00ns ± 0% -14.76% (p=0.000 n=8+8) Memmove/128-4 8.46ns ± 0% 8.06ns ± 0% -4.62% (p=0.000 n=10+10) Memmove/256-4 12.4ns ± 0% 12.2ns ± 0% -1.61% (p=0.000 n=10+10) Memmove/512-4 19.5ns ± 0% 19.1ns ± 0% -2.05% (p=0.000 n=10+10) Memmove/1024-4 33.7ns ± 0% 33.5ns ± 0% -0.59% (p=0.000 n=10+10) Memmove/2048-4 62.1ns ± 0% 59.0ns ± 0% -4.99% (p=0.000 n=10+10) Memmove/4096-4 117ns ± 1% 110ns ± 0% -5.66% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 6.41ns ± 0% 5.62ns ± 0% -12.32% (p=0.000 n=10+7) MemmoveUnalignedDst/128-4 9.40ns ± 0% 8.34ns ± 0% -11.24% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 12.8ns ± 0% 12.8ns ± 0% ~ (all equal) MemmoveUnalignedDst/512-4 20.4ns ± 0% 19.7ns ± 0% -3.43% (p=0.000 n=9+10) MemmoveUnalignedDst/1024-4 34.1ns ± 0% 35.1ns ± 0% +2.93% (p=0.000 n=9+9) MemmoveUnalignedDst/2048-4 61.5ns ± 0% 60.4ns ± 0% -1.77% (p=0.000 n=10+10) MemmoveUnalignedDst/4096-4 122ns ± 0% 113ns ± 0% -7.38% (p=0.002 n=8+10) MemmoveUnalignedSrc/64-4 7.25ns ± 1% 6.26ns ± 0% -13.64% (p=0.000 n=9+9) MemmoveUnalignedSrc/128-4 10.5ns ± 0% 9.7ns ± 0% -7.52% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 17.1ns ± 0% 17.3ns ± 0% +1.17% (p=0.000 n=10+10) MemmoveUnalignedSrc/512-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal) MemmoveUnalignedSrc/1024-4 46.7ns ± 0% 35.7ns ± 0% -23.55% (p=0.000 n=10+9) MemmoveUnalignedSrc/2048-4 85.2ns ± 0% 61.2ns ± 0% -28.17% (p=0.000 n=10+8) MemmoveUnalignedSrc/4096-4 162ns ± 0% 113ns ± 0% -30.25% (p=0.000 n=10+10) name old speed new speed delta Memmove/4096-4 35.2GB/s ± 0% 37.1GB/s ± 0% +5.56% (p=0.000 n=10+9) MemmoveUnalignedSrc/1024-4 21.9GB/s ± 0% 28.7GB/s ± 0% +30.90% (p=0.000 n=10+10) MemmoveUnalignedSrc/2048-4 24.0GB/s ± 0% 33.5GB/s ± 0% +39.18% (p=0.000 n=10+9) MemmoveUnalignedSrc/4096-4 25.3GB/s ± 0% 36.2GB/s ± 0% +43.50% (p=0.000 n=10+7) Cortex-A72 (Graviton 1) name old time/op new time/op delta Memmove/0-4 3.06ns ± 3% 3.08ns ± 1% ~ (p=0.958 n=10+9) Memmove/1-4 8.72ns ± 0% 7.85ns ± 0% -9.98% (p=0.002 n=8+10) Memmove/8-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/16-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/32-4 8.19ns ± 2% 8.29ns ± 0% ~ (p=0.114 n=10+10) Memmove/64-4 18.3ns ± 4% 10.0ns ± 0% -45.36% (p=0.000 n=10+10) Memmove/128-4 14.8ns ± 0% 17.4ns ± 0% +17.77% (p=0.000 n=10+10) Memmove/256-4 21.8ns ± 0% 23.1ns ± 0% +5.96% (p=0.000 n=10+10) Memmove/512-4 35.8ns ± 0% 37.2ns ± 0% +3.91% (p=0.000 n=10+10) Memmove/1024-4 63.7ns ± 0% 67.2ns ± 0% +5.49% (p=0.000 n=10+10) Memmove/2048-4 126ns ± 0% 123ns ± 0% -2.38% (p=0.000 n=10+10) Memmove/4096-4 238ns ± 1% 243ns ± 1% +1.93% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 19.3ns ± 1% 12.0ns ± 1% -37.49% (p=0.000 n=10+10) MemmoveUnalignedDst/128-4 17.2ns ± 0% 17.4ns ± 0% +1.16% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 28.2ns ± 8% 29.2ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedDst/512-4 49.8ns ± 3% 48.9ns ± 0% ~ (p=1.000 n=10+10) MemmoveUnalignedDst/1024-4 89.5ns ± 0% 80.5ns ± 1% -10.02% (p=0.000 n=10+10) MemmoveUnalignedDst/2048-4 180ns ± 0% 127ns ± 0% -29.44% (p=0.000 n=9+10) MemmoveUnalignedDst/4096-4 347ns ± 0% 244ns ± 0% -29.59% (p=0.000 n=10+9) MemmoveUnalignedSrc/128-4 16.1ns ± 0% 21.8ns ± 0% +35.40% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 24.9ns ± 8% 26.6ns ± 0% +6.70% (p=0.015 n=10+10) MemmoveUnalignedSrc/512-4 39.4ns ± 6% 40.6ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedSrc/1024-4 72.5ns ± 0% 83.0ns ± 1% +14.44% (p=0.000 n=9+10) MemmoveUnalignedSrc/2048-4 129ns ± 1% 128ns ± 1% ~ (p=0.179 n=10+10) MemmoveUnalignedSrc/4096-4 241ns ± 0% 253ns ± 1% +4.99% (p=0.000 n=9+9) Cortex-A53 (Raspberry Pi 3) name old time/op new time/op delta Memmove/0-4 11.0ns ± 0% 11.0ns ± 1% ~ (p=0.294 n=8+10) Memmove/1-4 29.6ns ± 0% 28.0ns ± 1% -5.41% (p=0.000 n=9+10) Memmove/8-4 23.5ns ± 0% 22.1ns ± 0% -6.11% (p=0.000 n=8+8) Memmove/16-4 23.7ns ± 1% 22.1ns ± 0% -6.59% (p=0.000 n=10+8) Memmove/32-4 27.9ns ± 0% 27.1ns ± 0% -3.13% (p=0.000 n=8+8) Memmove/64-4 33.8ns ± 0% 31.5ns ± 1% -6.99% (p=0.000 n=8+10) Memmove/128-4 45.6ns ± 0% 44.2ns ± 1% -3.23% (p=0.000 n=9+10) Memmove/256-4 69.3ns ± 0% 69.3ns ± 0% ~ (p=0.072 n=8+8) Memmove/512-4 127ns ± 0% 110ns ± 0% -13.39% (p=0.000 n=8+8) Memmove/1024-4 222ns ± 0% 205ns ± 1% -7.66% (p=0.000 n=7+10) Memmove/2048-4 411ns ± 0% 366ns ± 0% -10.98% (p=0.000 n=8+9) Memmove/4096-4 795ns ± 1% 695ns ± 1% -12.63% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 44.0ns ± 0% 40.5ns ± 0% -7.93% (p=0.000 n=8+8) MemmoveUnalignedDst/128-4 59.6ns ± 0% 54.9ns ± 0% -7.85% (p=0.000 n=9+9) MemmoveUnalignedDst/256-4 98.2ns ±11% 90.0ns ± 1% ~ (p=0.130 n=10+10) MemmoveUnalignedDst/512-4 161ns ± 2% 145ns ± 1% -9.96% (p=0.000 n=10+10) MemmoveUnalignedDst/1024-4 281ns ± 0% 265ns ± 0% -5.65% (p=0.000 n=9+8) MemmoveUnalignedDst/2048-4 528ns ± 0% 482ns ± 0% -8.73% (p=0.000 n=8+9) MemmoveUnalignedDst/4096-4 1.02µs ± 1% 0.92µs ± 0% -10.00% (p=0.000 n=10+8) MemmoveUnalignedSrc/64-4 42.4ns ± 1% 40.5ns ± 0% -4.39% (p=0.000 n=10+8) MemmoveUnalignedSrc/128-4 57.4ns ± 0% 57.0ns ± 1% -0.75% (p=0.048 n=9+10) MemmoveUnalignedSrc/256-4 88.1ns ± 1% 89.6ns ± 0% +1.70% (p=0.000 n=9+8) MemmoveUnalignedSrc/512-4 160ns ± 2% 144ns ± 0% -9.89% (p=0.000 n=10+8) MemmoveUnalignedSrc/1024-4 286ns ± 0% 266ns ± 1% -6.69% (p=0.000 n=8+10) MemmoveUnalignedSrc/2048-4 525ns ± 0% 483ns ± 1% -7.96% (p=0.000 n=9+10) MemmoveUnalignedSrc/4096-4 1.01µs ± 0% 0.92µs ± 1% -9.40% (p=0.000 n=8+10) Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab Reviewed-on: https://go-review.googlesource.com/c/go/+/243357 TryBot-Result: Go Bot Reviewed-by: Martin Möhrmann Reviewed-by: eric fang Reviewed-by: Cherry Zhang --- src/internal/cpu/cpu.go | 54 +++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'src/internal/cpu/cpu.go') diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 2829945af0..0ceedcd7d2 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -56,32 +56,34 @@ var ARM struct { // The booleans in ARM64 contain the correspondingly named cpu feature bit. // The struct is padded to avoid false sharing. var ARM64 struct { - _ CacheLinePad - HasFP bool - HasASIMD bool - HasEVTSTRM bool - HasAES bool - HasPMULL bool - HasSHA1 bool - HasSHA2 bool - HasCRC32 bool - HasATOMICS bool - HasFPHP bool - HasASIMDHP bool - HasCPUID bool - HasASIMDRDM bool - HasJSCVT bool - HasFCMA bool - HasLRCPC bool - HasDCPOP bool - HasSHA3 bool - HasSM3 bool - HasSM4 bool - HasASIMDDP bool - HasSHA512 bool - HasSVE bool - HasASIMDFHM bool - _ CacheLinePad + _ CacheLinePad + HasFP bool + HasASIMD bool + HasEVTSTRM bool + HasAES bool + HasPMULL bool + HasSHA1 bool + HasSHA2 bool + HasCRC32 bool + HasATOMICS bool + HasFPHP bool + HasASIMDHP bool + HasCPUID bool + HasASIMDRDM bool + HasJSCVT bool + HasFCMA bool + HasLRCPC bool + HasDCPOP bool + HasSHA3 bool + HasSM3 bool + HasSM4 bool + HasASIMDDP bool + HasSHA512 bool + HasSVE bool + HasASIMDFHM bool + IsNeoverseN1 bool + IsZeus bool + _ CacheLinePad } var MIPS64X struct { -- cgit v1.3 From 3510a1e32cbc86b73db143aefcc00aadc44c27bd Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Thu, 5 Nov 2020 05:59:34 +0100 Subject: internal/cpu: fix and cleanup ARM64 cpu feature fields and options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all cpu features from the ARM64 struct that are not initialized to reduce cache lines used and to avoid those features being accidentially used without actual detection if they are present. Add missing option to mask the CPUID feature. Change-Id: I94bf90c0655de1af2218ac72117ac6c52adfc289 Reviewed-on: https://go-review.googlesource.com/c/go/+/267658 Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Tobias Klauser Trust: Martin Möhrmann --- src/internal/cpu/cpu.go | 17 ----------------- src/internal/cpu/cpu_arm64.go | 1 + 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'src/internal/cpu/cpu.go') diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 0ceedcd7d2..dab5d068ef 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -57,30 +57,13 @@ var ARM struct { // The struct is padded to avoid false sharing. var ARM64 struct { _ CacheLinePad - HasFP bool - HasASIMD bool - HasEVTSTRM bool HasAES bool HasPMULL bool HasSHA1 bool HasSHA2 bool HasCRC32 bool HasATOMICS bool - HasFPHP bool - HasASIMDHP bool HasCPUID bool - HasASIMDRDM bool - HasJSCVT bool - HasFCMA bool - HasLRCPC bool - HasDCPOP bool - HasSHA3 bool - HasSM3 bool - HasSM4 bool - HasASIMDDP bool - HasSHA512 bool - HasSVE bool - HasASIMDFHM bool IsNeoverseN1 bool IsZeus bool _ CacheLinePad diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index 8fde39f03e..4e9ea8ca96 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -29,6 +29,7 @@ func doinit() { {Name: "sha2", Feature: &ARM64.HasSHA2}, {Name: "crc32", Feature: &ARM64.HasCRC32}, {Name: "atomics", Feature: &ARM64.HasATOMICS}, + {Name: "cpuid", Feature: &ARM64.HasCPUID}, {Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1}, {Name: "isZeus", Feature: &ARM64.IsZeus}, } -- cgit v1.3