aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/asm
diff options
context:
space:
mode:
authorCherry Zhang <cherryyz@google.com>2017-04-28 18:02:00 -0400
committerCherry Zhang <cherryyz@google.com>2017-05-09 19:41:00 +0000
commitfb0ccc5d0ac41edc545a877691d84bbb86801a07 (patch)
tree7586fcfb164bc2f5aad4af8a0dbdea4df3e56e0f /src/cmd/asm
parent5e0bcb3893c2e54fdb96affcafd2953f20dd64eb (diff)
downloadgo-fb0ccc5d0ac41edc545a877691d84bbb86801a07.tar.xz
cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64
The ARM64 assembler backend only accepts loads and stores with small or aligned offsets. The compiler therefore can only fold small or aligned offsets into loads and stores. For locals and args, their offsets to SP are not known until very late, and the compiler makes the conservative decision of not folding some of them. However, in most cases, the offset is indeed small or aligned, and could be folded into the load or store (but currently is not). This CL adds support for loads and stores with large and unaligned offsets. When the offset doesn't fit into the instruction, it uses two instructions and (for very large offsets) the constant pool. This way, the compiler doesn't need to be conservative, and can simply fold the offset. To make it work, the assembler's optab matching rules need to be changed. Before, MOVD accepted C_UAUTO32K, which matches multiples of 8 between 0 and 32K, and also C_UAUTO16K, which may not be a multiple of 8 and does not fit into the MOVD instruction. The assembler errored in the latter case. This change makes it match only multiples of 8 (or offsets within ±256, which also fit in the instruction), and uses the large-or-unaligned-offset rule for things that don't fit (without error). Other sized move rules are changed similarly. Classes C_UAUTO64K and C_UOREG64K are removed, as they are never used. In shared library mode, load/store of a global is rewritten to use the GOT and the temp register, which conflicts with the use of the temp register for assembling large offsets. So the folding is disabled for globals in shared library mode. Reduces cmd/go binary size by 2%. 
name old time/op new time/op delta BinaryTree17-8 8.67s ± 0% 8.61s ± 0% -0.60% (p=0.000 n=9+10) Fannkuch11-8 6.24s ± 0% 6.19s ± 0% -0.83% (p=0.000 n=10+9) FmtFprintfEmpty-8 116ns ± 0% 116ns ± 0% ~ (all equal) FmtFprintfString-8 196ns ± 0% 192ns ± 0% -1.89% (p=0.000 n=10+10) FmtFprintfInt-8 199ns ± 0% 198ns ± 0% -0.35% (p=0.001 n=9+10) FmtFprintfIntInt-8 294ns ± 0% 293ns ± 0% -0.34% (p=0.000 n=8+8) FmtFprintfPrefixedInt-8 318ns ± 1% 318ns ± 1% ~ (p=1.000 n=10+10) FmtFprintfFloat-8 537ns ± 0% 531ns ± 0% -1.17% (p=0.000 n=9+10) FmtManyArgs-8 1.19µs ± 1% 1.18µs ± 1% -1.41% (p=0.001 n=10+10) GobDecode-8 17.2ms ± 1% 17.3ms ± 2% ~ (p=0.165 n=10+10) GobEncode-8 14.7ms ± 1% 14.7ms ± 2% ~ (p=0.631 n=10+10) Gzip-8 837ms ± 0% 836ms ± 0% -0.14% (p=0.006 n=9+10) Gunzip-8 141ms ± 0% 139ms ± 0% -1.24% (p=0.000 n=9+10) HTTPClientServer-8 256µs ± 1% 253µs ± 1% -1.35% (p=0.000 n=10+10) JSONEncode-8 40.1ms ± 1% 41.3ms ± 1% +3.06% (p=0.000 n=10+9) JSONDecode-8 157ms ± 1% 156ms ± 1% -0.83% (p=0.001 n=9+8) Mandelbrot200-8 8.94ms ± 0% 8.94ms ± 0% +0.02% (p=0.000 n=9+9) GoParse-8 8.69ms ± 0% 8.54ms ± 1% -1.69% (p=0.000 n=8+10) RegexpMatchEasy0_32-8 227ns ± 1% 228ns ± 1% +0.48% (p=0.016 n=10+9) RegexpMatchEasy0_1K-8 1.92µs ± 0% 1.63µs ± 0% -15.08% (p=0.000 n=10+9) RegexpMatchEasy1_32-8 256ns ± 0% 251ns ± 0% -2.19% (p=0.000 n=10+9) RegexpMatchEasy1_1K-8 2.38µs ± 0% 2.09µs ± 0% -12.49% (p=0.000 n=10+9) RegexpMatchMedium_32-8 352ns ± 0% 354ns ± 0% +0.39% (p=0.002 n=10+9) RegexpMatchMedium_1K-8 106µs ± 0% 106µs ± 0% -0.05% (p=0.005 n=10+9) RegexpMatchHard_32-8 5.92µs ± 0% 5.89µs ± 0% -0.40% (p=0.000 n=9+8) RegexpMatchHard_1K-8 180µs ± 0% 179µs ± 0% -0.14% (p=0.000 n=10+9) Revcomp-8 1.20s ± 0% 1.13s ± 0% -6.29% (p=0.000 n=9+8) Template-8 159ms ± 1% 154ms ± 1% -3.14% (p=0.000 n=9+10) TimeParse-8 800ns ± 3% 769ns ± 1% -3.91% (p=0.000 n=10+10) TimeFormat-8 826ns ± 2% 817ns ± 2% -1.04% (p=0.050 n=10+10) [Geo mean] 145µs 143µs -1.79% Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985 Reviewed-on: 
https://go-review.googlesource.com/42172 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
Diffstat (limited to 'src/cmd/asm')
-rw-r--r--src/cmd/asm/internal/asm/testdata/arm64.s54
1 files changed, 54 insertions, 0 deletions
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index 734ed152b2..1b6dc188c4 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -85,6 +85,60 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
MOVD $1, R1
MOVD ZR, (R1)
+ // small offset (aligned, or within +/-256) fits into a single instruction
+ MOVB 1(R1), R2 // 22048039
+ MOVH 1(R1), R2 // 22108078
+ MOVH 2(R1), R2 // 22048079
+ MOVW 1(R1), R2 // 221080b8
+ MOVW 4(R1), R2 // 220480b9
+ MOVD 1(R1), R2 // 221040f8
+ MOVD 8(R1), R2 // 220440f9
+ FMOVS 1(R1), F2 // 221040bc
+ FMOVS 4(R1), F2 // 220440bd
+ FMOVD 1(R1), F2 // 221040fc
+ FMOVD 8(R1), F2 // 220440fd
+ MOVB R1, 1(R2) // 41040039
+ MOVH R1, 1(R2) // 41100078
+ MOVH R1, 2(R2) // 41040079
+ MOVW R1, 1(R2) // 411000b8
+ MOVW R1, 4(R2) // 410400b9
+ MOVD R1, 1(R2) // 411000f8
+ MOVD R1, 8(R2) // 410400f9
+ FMOVS F1, 1(R2) // 411000bc
+ FMOVS F1, 4(R2) // 410400bd
+ FMOVD F1, 1(R2) // 411000fc
+ FMOVD F1, 8(R2) // 410400fd
+
+ // large aligned offset uses two instructions
+ MOVB 0x1001(R1), R2 // MOVB 4097(R1), R2 // 3b04409162078039
+ MOVH 0x2002(R1), R2 // MOVH 8194(R1), R2 // 3b08409162078079
+ MOVW 0x4004(R1), R2 // MOVW 16388(R1), R2 // 3b104091620780b9
+ MOVD 0x8008(R1), R2 // MOVD 32776(R1), R2 // 3b204091620740f9
+ FMOVS 0x4004(R1), F2 // FMOVS 16388(R1), F2 // 3b104091620740bd
+ FMOVD 0x8008(R1), F2 // FMOVD 32776(R1), F2 // 3b204091620740fd
+ MOVB R1, 0x1001(R2) // MOVB R1, 4097(R2) // 5b04409161070039
+ MOVH R1, 0x2002(R2) // MOVH R1, 8194(R2) // 5b08409161070079
+ MOVW R1, 0x4004(R2) // MOVW R1, 16388(R2) // 5b104091610700b9
+ MOVD R1, 0x8008(R2) // MOVD R1, 32776(R2) // 5b204091610700f9
+ FMOVS F1, 0x4004(R2) // FMOVS F1, 16388(R2) // 5b104091610700bd
+ FMOVD F1, 0x8008(R2) // FMOVD F1, 32776(R2) // 5b204091610700fd
+
+ // very large or unaligned offset uses the constant pool.
+ // The encoding cannot be checked because the address of the constant pool is unknown,
+ // so here we only test that these instructions can be assembled.
+ MOVB 0x44332211(R1), R2 // MOVB 1144201745(R1), R2
+ MOVH 0x44332211(R1), R2 // MOVH 1144201745(R1), R2
+ MOVW 0x44332211(R1), R2 // MOVW 1144201745(R1), R2
+ MOVD 0x44332211(R1), R2 // MOVD 1144201745(R1), R2
+ FMOVS 0x44332211(R1), F2 // FMOVS 1144201745(R1), F2
+ FMOVD 0x44332211(R1), F2 // FMOVD 1144201745(R1), F2
+ MOVB R1, 0x44332211(R2) // MOVB R1, 1144201745(R2)
+ MOVH R1, 0x44332211(R2) // MOVH R1, 1144201745(R2)
+ MOVW R1, 0x44332211(R2) // MOVW R1, 1144201745(R2)
+ MOVD R1, 0x44332211(R2) // MOVD R1, 1144201745(R2)
+ FMOVS F1, 0x44332211(R2) // FMOVS F1, 1144201745(R2)
+ FMOVD F1, 0x44332211(R2) // FMOVD F1, 1144201745(R2)
+
//
// MOVK
//