diff options
| author | Cherry Zhang <cherryyz@google.com> | 2017-04-28 18:02:00 -0400 |
|---|---|---|
| committer | Cherry Zhang <cherryyz@google.com> | 2017-05-09 19:41:00 +0000 |
| commit | fb0ccc5d0ac41edc545a877691d84bbb86801a07 (patch) | |
| tree | 7586fcfb164bc2f5aad4af8a0dbdea4df3e56e0f /src/cmd/internal/obj/arm64 | |
| parent | 5e0bcb3893c2e54fdb96affcafd2953f20dd64eb (diff) | |
| download | go-fb0ccc5d0ac41edc545a877691d84bbb86801a07.tar.xz | |
cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64
The ARM64 assembler backend only accepts loads and stores with
small or aligned offsets. The compiler therefore can only fold small or
aligned offsets into loads and stores. For locals and args, their
offsets to SP are not known until very late, and the compiler
makes the conservative decision not to fold some of them. However,
in most cases, the offset is indeed small or aligned, and could
be folded into the load or store (but currently is not).
This CL adds support for loads and stores with large or unaligned
offsets. When the offset doesn't fit into the instruction, it
uses two instructions and (for very large offsets) the constant
pool. This way, the compiler doesn't need to be conservative,
and can simply fold the offset.
To make it work, the assembler's optab matching rules need to be
changed. Before, MOVD accepted C_UAUTO32K, which matches multiples
of 8 between 0 and 32K, and also C_UAUTO16K, which may not be a
multiple of 8 and does not fit into a MOVD instruction. The
assembler reported an error in the latter case. This change makes
MOVD match only multiples of 8 (or offsets within ±256, which also fit
in the instruction), and use the large-or-unaligned-offset rule
for anything that doesn't fit (without error). Other sized move rules
are changed similarly.
Classes C_UAUTO64K and C_UOREG64K are removed, as they are never
used.
In shared library mode, a load/store of a global is rewritten to use
the GOT and the temp register, which conflicts with the use of the temp
register for assembling large offsets. So the folding is disabled
for globals in shared library mode.
Reduce cmd/go binary size by 2%.
name old time/op new time/op delta
BinaryTree17-8 8.67s ± 0% 8.61s ± 0% -0.60% (p=0.000 n=9+10)
Fannkuch11-8 6.24s ± 0% 6.19s ± 0% -0.83% (p=0.000 n=10+9)
FmtFprintfEmpty-8 116ns ± 0% 116ns ± 0% ~ (all equal)
FmtFprintfString-8 196ns ± 0% 192ns ± 0% -1.89% (p=0.000 n=10+10)
FmtFprintfInt-8 199ns ± 0% 198ns ± 0% -0.35% (p=0.001 n=9+10)
FmtFprintfIntInt-8 294ns ± 0% 293ns ± 0% -0.34% (p=0.000 n=8+8)
FmtFprintfPrefixedInt-8 318ns ± 1% 318ns ± 1% ~ (p=1.000 n=10+10)
FmtFprintfFloat-8 537ns ± 0% 531ns ± 0% -1.17% (p=0.000 n=9+10)
FmtManyArgs-8 1.19µs ± 1% 1.18µs ± 1% -1.41% (p=0.001 n=10+10)
GobDecode-8 17.2ms ± 1% 17.3ms ± 2% ~ (p=0.165 n=10+10)
GobEncode-8 14.7ms ± 1% 14.7ms ± 2% ~ (p=0.631 n=10+10)
Gzip-8 837ms ± 0% 836ms ± 0% -0.14% (p=0.006 n=9+10)
Gunzip-8 141ms ± 0% 139ms ± 0% -1.24% (p=0.000 n=9+10)
HTTPClientServer-8 256µs ± 1% 253µs ± 1% -1.35% (p=0.000 n=10+10)
JSONEncode-8 40.1ms ± 1% 41.3ms ± 1% +3.06% (p=0.000 n=10+9)
JSONDecode-8 157ms ± 1% 156ms ± 1% -0.83% (p=0.001 n=9+8)
Mandelbrot200-8 8.94ms ± 0% 8.94ms ± 0% +0.02% (p=0.000 n=9+9)
GoParse-8 8.69ms ± 0% 8.54ms ± 1% -1.69% (p=0.000 n=8+10)
RegexpMatchEasy0_32-8 227ns ± 1% 228ns ± 1% +0.48% (p=0.016 n=10+9)
RegexpMatchEasy0_1K-8 1.92µs ± 0% 1.63µs ± 0% -15.08% (p=0.000 n=10+9)
RegexpMatchEasy1_32-8 256ns ± 0% 251ns ± 0% -2.19% (p=0.000 n=10+9)
RegexpMatchEasy1_1K-8 2.38µs ± 0% 2.09µs ± 0% -12.49% (p=0.000 n=10+9)
RegexpMatchMedium_32-8 352ns ± 0% 354ns ± 0% +0.39% (p=0.002 n=10+9)
RegexpMatchMedium_1K-8 106µs ± 0% 106µs ± 0% -0.05% (p=0.005 n=10+9)
RegexpMatchHard_32-8 5.92µs ± 0% 5.89µs ± 0% -0.40% (p=0.000 n=9+8)
RegexpMatchHard_1K-8 180µs ± 0% 179µs ± 0% -0.14% (p=0.000 n=10+9)
Revcomp-8 1.20s ± 0% 1.13s ± 0% -6.29% (p=0.000 n=9+8)
Template-8 159ms ± 1% 154ms ± 1% -3.14% (p=0.000 n=9+10)
TimeParse-8 800ns ± 3% 769ns ± 1% -3.91% (p=0.000 n=10+10)
TimeFormat-8 826ns ± 2% 817ns ± 2% -1.04% (p=0.050 n=10+10)
[Geo mean] 145µs 143µs -1.79%
Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985
Reviewed-on: https://go-review.googlesource.com/42172
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: David Chase <drchase@google.com>
Diffstat (limited to 'src/cmd/internal/obj/arm64')
| -rw-r--r-- | src/cmd/internal/obj/arm64/a.out.go | 35 | ||||
| -rw-r--r-- | src/cmd/internal/obj/arm64/anames7.go | 14 | ||||
| -rw-r--r-- | src/cmd/internal/obj/arm64/asm7.go | 346 |
3 files changed, 268 insertions, 127 deletions
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go index f192a51b0a..3a3fed5cf5 100644 --- a/src/cmd/internal/obj/arm64/a.out.go +++ b/src/cmd/internal/obj/arm64/a.out.go @@ -289,16 +289,21 @@ const ( C_SBRA // for TYPE_BRANCH C_LBRA - C_NPAUTO // -512 <= x < 0, 0 mod 8 - C_NSAUTO // -256 <= x < 0 - C_PSAUTO // 0 to 255 - C_PPAUTO // 0 to 504, 0 mod 8 - C_UAUTO4K // 0 to 4095 - C_UAUTO8K // 0 to 8190, 0 mod 2 - C_UAUTO16K // 0 to 16380, 0 mod 4 - C_UAUTO32K // 0 to 32760, 0 mod 8 - C_UAUTO64K // 0 to 65520, 0 mod 16 - C_LAUTO // any other 32-bit constant + C_NPAUTO // -512 <= x < 0, 0 mod 8 + C_NSAUTO // -256 <= x < 0 + C_PSAUTO // 0 to 255 + C_PPAUTO // 0 to 504, 0 mod 8 + C_UAUTO4K_8 // 0 to 4095, 0 mod 8 + C_UAUTO4K_4 // 0 to 4095, 0 mod 4 + C_UAUTO4K_2 // 0 to 4095, 0 mod 2 + C_UAUTO4K // 0 to 4095 + C_UAUTO8K_8 // 0 to 8190, 0 mod 8 + C_UAUTO8K_4 // 0 to 8190, 0 mod 4 + C_UAUTO8K // 0 to 8190, 0 mod 2 + C_UAUTO16K_8 // 0 to 16380, 0 mod 8 + C_UAUTO16K // 0 to 16380, 0 mod 4 + C_UAUTO32K // 0 to 32760, 0 mod 8 + C_LAUTO // any other 32-bit constant C_SEXT1 // 0 to 4095, direct C_SEXT2 // 0 to 8190 @@ -307,17 +312,21 @@ const ( C_SEXT16 // 0 to 65520 C_LEXT - // TODO(aram): s/AUTO/INDIR/ C_ZOREG // 0(R) - C_NPOREG // mirror NPAUTO, etc + C_NPOREG // must mirror NPAUTO, etc C_NSOREG C_PSOREG C_PPOREG + C_UOREG4K_8 + C_UOREG4K_4 + C_UOREG4K_2 C_UOREG4K + C_UOREG8K_8 + C_UOREG8K_4 C_UOREG8K + C_UOREG16K_8 C_UOREG16K C_UOREG32K - C_UOREG64K C_LOREG C_ADDR // TODO(aram): explain difference from C_VCONADDR diff --git a/src/cmd/internal/obj/arm64/anames7.go b/src/cmd/internal/obj/arm64/anames7.go index c3ef2f652e..24911f657d 100644 --- a/src/cmd/internal/obj/arm64/anames7.go +++ b/src/cmd/internal/obj/arm64/anames7.go @@ -37,11 +37,16 @@ var cnames7 = []string{ "NSAUTO", "PSAUTO", "PPAUTO", + "UAUTO4K_8", + "UAUTO4K_4", + "UAUTO4K_2", "UAUTO4K", + "UAUTO8K_8", + "UAUTO8K_4", "UAUTO8K", + "UAUTO16K_8", "UAUTO16K", "UAUTO32K", - 
"UAUTO64K", "LAUTO", "SEXT1", "SEXT2", @@ -54,11 +59,16 @@ var cnames7 = []string{ "NSOREG", "PSOREG", "PPOREG", + "UOREG4K_8", + "UOREG4K_4", + "UOREG4K_2", "UOREG4K", + "UOREG8K_8", + "UOREG8K_4", "UOREG8K", + "UOREG16K_8", "UOREG16K", "UOREG32K", - "UOREG64K", "LOREG", "ADDR", "GOTADDR", diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 8218c6b333..f4e2562cdd 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -369,33 +369,28 @@ var optab = []Optab{ {AMOVD, C_NSOREG, C_NONE, C_REG, 21, 4, REGSP, 0, 0}, /* long displacement store */ - {AMOVB, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, 0, 0}, - {AMOVB, C_REG, C_NONE, C_LOREG, 30, 8, 0, 0, 0}, - {AMOVBU, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, 0, 0}, - {AMOVBU, C_REG, C_NONE, C_LOREG, 30, 8, 0, 0, 0}, - {AMOVH, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, 0, 0}, - {AMOVH, C_REG, C_NONE, C_LOREG, 30, 8, 0, 0, 0}, - {AMOVW, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, 0, 0}, - {AMOVW, C_REG, C_NONE, C_LOREG, 30, 8, 0, 0, 0}, - {AMOVD, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, 0, 0}, - {AMOVD, C_REG, C_NONE, C_LOREG, 30, 8, 0, 0, 0}, + {AMOVB, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0}, + {AMOVB, C_REG, C_NONE, C_LOREG, 30, 8, 0, LTO, 0}, + {AMOVBU, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0}, + {AMOVBU, C_REG, C_NONE, C_LOREG, 30, 8, 0, LTO, 0}, + {AMOVH, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0}, + {AMOVH, C_REG, C_NONE, C_LOREG, 30, 8, 0, LTO, 0}, + {AMOVW, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0}, + {AMOVW, C_REG, C_NONE, C_LOREG, 30, 8, 0, LTO, 0}, + {AMOVD, C_REG, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0}, + {AMOVD, C_REG, C_NONE, C_LOREG, 30, 8, 0, LTO, 0}, /* long displacement load */ - {AMOVB, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, 0, 0}, - {AMOVB, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVB, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVBU, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, 0, 0}, - {AMOVBU, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, 
- {AMOVBU, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVH, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, 0, 0}, - {AMOVH, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVH, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVW, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, 0, 0}, - {AMOVW, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVW, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVD, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, 0, 0}, - {AMOVD, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, - {AMOVD, C_LOREG, C_NONE, C_REG, 31, 8, 0, 0, 0}, + {AMOVB, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0}, + {AMOVB, C_LOREG, C_NONE, C_REG, 31, 8, 0, LFROM, 0}, + {AMOVBU, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0}, + {AMOVBU, C_LOREG, C_NONE, C_REG, 31, 8, 0, LFROM, 0}, + {AMOVH, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0}, + {AMOVH, C_LOREG, C_NONE, C_REG, 31, 8, 0, LFROM, 0}, + {AMOVW, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0}, + {AMOVW, C_LOREG, C_NONE, C_REG, 31, 8, 0, LFROM, 0}, + {AMOVD, C_LAUTO, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0}, + {AMOVD, C_LOREG, C_NONE, C_REG, 31, 8, 0, LFROM, 0}, /* load long effective stack address (load int32 offset and add) */ {AMOVD, C_LACON, C_NONE, C_REG, 34, 8, REGSP, LFROM, 0}, @@ -741,7 +736,7 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) { // MOVD addr, REGTMP // MOVD REGTMP, R // where addr is the address of the DWORD containing the address of foo. - if p.As == AMOVD || cls == C_ADDR || cls == C_VCON || int64(lit) != int64(int32(lit)) || uint64(lit) != uint64(uint32(lit)) { + if p.As == AMOVD && a.Type != obj.TYPE_MEM || cls == C_ADDR || cls == C_VCON || int64(lit) != int64(int32(lit)) || uint64(lit) != uint64(uint32(lit)) { // conservative: don't know if we want signed or unsigned extension. 
// in case of ambiguity, store 64-bit t.As = ADWORD @@ -767,21 +762,31 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) { case C_PSAUTO, C_PPAUTO, + C_UAUTO4K_8, + C_UAUTO4K_4, + C_UAUTO4K_2, C_UAUTO4K, + C_UAUTO8K_8, + C_UAUTO8K_4, C_UAUTO8K, + C_UAUTO16K_8, C_UAUTO16K, C_UAUTO32K, - C_UAUTO64K, C_NSAUTO, C_NPAUTO, C_LAUTO, C_PPOREG, C_PSOREG, + C_UOREG4K_8, + C_UOREG4K_4, + C_UOREG4K_2, C_UOREG4K, + C_UOREG8K_8, + C_UOREG8K_4, C_UOREG8K, + C_UOREG16K_8, C_UOREG16K, C_UOREG32K, - C_UOREG64K, C_NSOREG, C_NPOREG, C_LOREG, @@ -998,20 +1003,39 @@ func autoclass(l int64) int { return C_PPAUTO } if l <= 4095 { + if l&7 == 0 { + return C_UAUTO4K_8 + } + if l&3 == 0 { + return C_UAUTO4K_4 + } + if l&1 == 0 { + return C_UAUTO4K_2 + } return C_UAUTO4K } - if l <= 8190 && (l&1) == 0 { - return C_UAUTO8K + if l <= 8190 { + if l&7 == 0 { + return C_UAUTO8K_8 + } + if l&3 == 0 { + return C_UAUTO8K_4 + } + if l&1 == 0 { + return C_UAUTO8K + } } - if l <= 16380 && (l&3) == 0 { - return C_UAUTO16K + if l <= 16380 { + if l&7 == 0 { + return C_UAUTO16K_8 + } + if l&3 == 0 { + return C_UAUTO16K + } } if l <= 32760 && (l&7) == 0 { return C_UAUTO32K } - if l <= 65520 && (l&0xF) == 0 { - return C_UAUTO64K - } return C_LAUTO } @@ -1031,10 +1055,19 @@ func (c *ctxt7) offsetshift(p *obj.Prog, v int64, cls int) int64 { s := 0 if cls >= C_SEXT1 && cls <= C_SEXT16 { s = cls - C_SEXT1 - } else if cls >= C_UAUTO4K && cls <= C_UAUTO64K { - s = cls - C_UAUTO4K - } else if cls >= C_UOREG4K && cls <= C_UOREG64K { - s = cls - C_UOREG4K + } else { + switch cls { + case C_UAUTO4K, C_UOREG4K, C_ZOREG: + s = 0 + case C_UAUTO8K, C_UOREG8K: + s = 1 + case C_UAUTO16K, C_UOREG16K: + s = 2 + case C_UAUTO32K, C_UOREG32K: + s = 3 + default: + c.ctxt.Diag("bad class: %v\n%v", DRconv(cls), p) + } } vs := v >> uint(s) if vs<<uint(s) != v { @@ -1349,27 +1382,42 @@ func cmp(a int, b int) bool { } case C_UAUTO4K: - if b == C_PSAUTO || b == C_PPAUTO { + switch b { + case C_PSAUTO, C_PPAUTO, C_UAUTO4K_2, 
C_UAUTO4K_4, C_UAUTO4K_8: return true } case C_UAUTO8K: - return cmp(C_UAUTO4K, b) + switch b { + case C_PSAUTO, C_PPAUTO, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8: + return true + } case C_UAUTO16K: - return cmp(C_UAUTO8K, b) + switch b { + case C_PSAUTO, C_PPAUTO, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO16K_8: + return true + } case C_UAUTO32K: - return cmp(C_UAUTO16K, b) - - case C_UAUTO64K: - return cmp(C_UAUTO32K, b) + switch b { + case C_PSAUTO, C_PPAUTO, C_UAUTO4K_8, C_UAUTO8K_8, C_UAUTO16K_8: + return true + } case C_NPAUTO: return cmp(C_NSAUTO, b) case C_LAUTO: - return cmp(C_NPAUTO, b) || cmp(C_UAUTO64K, b) + switch b { + case C_PSAUTO, C_PPAUTO, + C_UAUTO4K, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, + C_UAUTO8K, C_UAUTO8K_4, C_UAUTO8K_8, + C_UAUTO16K, C_UAUTO16K_8, + C_UAUTO32K: + return true + } + return cmp(C_NPAUTO, b) case C_PSOREG: if b == C_ZOREG { @@ -1382,27 +1430,42 @@ func cmp(a int, b int) bool { } case C_UOREG4K: - if b == C_ZOREG || b == C_PSAUTO || b == C_PSOREG || b == C_PPAUTO || b == C_PPOREG { + switch b { + case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8: return true } case C_UOREG8K: - return cmp(C_UOREG4K, b) + switch b { + case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8: + return true + } case C_UOREG16K: - return cmp(C_UOREG8K, b) + switch b { + case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4, C_UOREG8K_8, C_UOREG16K_8: + return true + } case C_UOREG32K: - return cmp(C_UOREG16K, b) - - case C_UOREG64K: - return cmp(C_UOREG32K, b) + switch b { + case C_ZOREG, C_PSOREG, C_PPOREG, C_UOREG4K_8, C_UOREG8K_8, C_UOREG16K_8: + return true + } case C_NPOREG: return cmp(C_NSOREG, b) case C_LOREG: - return cmp(C_NPOREG, b) || cmp(C_UOREG64K, b) + switch b { + case C_ZOREG, C_PSOREG, C_PPOREG, + C_UOREG4K, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, + C_UOREG8K, C_UOREG8K_4, C_UOREG8K_8, + C_UOREG16K, 
C_UOREG16K_8, + C_UOREG32K: + return true + } + return cmp(C_NPOREG, b) case C_LBRA: if b == C_SBRA { @@ -2420,58 +2483,92 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 |= uint32(p.From.Reg&31)<<5 | uint32(p.To.Reg&31) case 30: /* movT R,L(R) -> strT */ + // if offset L can be split into hi+lo, and both fit into instructions, do + // add $hi, R, Rtmp + // str R, lo(Rtmp) + // otherwise, use constant pool + // mov $L, Rtmp (from constant pool) + // str R, (R+Rtmp) s := movesize(o.as) - if s < 0 { c.ctxt.Diag("unexpected long move, op %v tab %v\n%v", p.As, o.as, p) } - v := int32(c.regoff(&p.To)) - if v < 0 { - c.ctxt.Diag("negative large offset\n%v", p) - } - if (v & ((1 << uint(s)) - 1)) != 0 { - c.ctxt.Diag("misaligned offset\n%v", p) - } - hi := v - (v & (0xFFF << uint(s))) - if (hi & 0xFFF) != 0 { - c.ctxt.Diag("internal: miscalculated offset %d [%d]\n%v", v, s, p) - } - //fprint(2, "v=%ld (%#lux) s=%d hi=%ld (%#lux) v'=%ld (%#lux)\n", v, v, s, hi, hi, ((v-hi)>>s)&0xFFF, ((v-hi)>>s)&0xFFF); r := int(p.To.Reg) - if r == 0 { r = int(o.param) } + + v := int32(c.regoff(&p.To)) + var hi int32 + if v < 0 || (v&((1<<uint(s))-1)) != 0 { + // negative or unaligned offset, use constant pool + goto storeusepool + } + + hi = v - (v & (0xFFF << uint(s))) + if hi&0xFFF != 0 { + c.ctxt.Diag("internal: miscalculated offset %d [%d]\n%v", v, s, p) + } + if hi&^0xFFF000 != 0 { + // hi doesn't fit into an ADD instruction + goto storeusepool + } + o1 = c.oaddi(p, int32(c.opirr(p, AADD)), hi, r, REGTMP) o2 = c.olsr12u(p, int32(c.opstr12(p, p.As)), ((v-hi)>>uint(s))&0xFFF, REGTMP, int(p.From.Reg)) + break + + storeusepool: + if r == REGTMP || p.From.Reg == REGTMP { + c.ctxt.Diag("REGTMP used in large offset store: %v", p) + } + o1 = c.omovlit(AMOVD, p, &p.To, REGTMP) + o2 = c.olsxrr(p, int32(c.opstrr(p, p.As)), int(p.From.Reg), r, REGTMP) case 31: /* movT L(R), R -> ldrT */ + // if offset L can be split into hi+lo, and both fit into instructions, do + // add $hi, 
R, Rtmp + // ldr lo(Rtmp), R + // otherwise, use constant pool + // mov $L, Rtmp (from constant pool) + // ldr (R+Rtmp), R s := movesize(o.as) - if s < 0 { c.ctxt.Diag("unexpected long move, op %v tab %v\n%v", p.As, o.as, p) } - v := int32(c.regoff(&p.From)) - if v < 0 { - c.ctxt.Diag("negative large offset\n%v", p) + + r := int(p.From.Reg) + if r == 0 { + r = int(o.param) } - if (v & ((1 << uint(s)) - 1)) != 0 { - c.ctxt.Diag("misaligned offset\n%v", p) + + v := int32(c.regoff(&p.From)) + var hi int32 + if v < 0 || (v&((1<<uint(s))-1)) != 0 { + // negative or unaligned offset, use constant pool + goto loadusepool } - hi := v - (v & (0xFFF << uint(s))) + + hi = v - (v & (0xFFF << uint(s))) if (hi & 0xFFF) != 0 { c.ctxt.Diag("internal: miscalculated offset %d [%d]\n%v", v, s, p) } - - //fprint(2, "v=%ld (%#lux) s=%d hi=%ld (%#lux) v'=%ld (%#lux)\n", v, v, s, hi, hi, ((v-hi)>>s)&0xFFF, ((v-hi)>>s)&0xFFF); - r := int(p.From.Reg) - - if r == 0 { - r = int(o.param) + if hi&^0xFFF000 != 0 { + // hi doesn't fit into an ADD instruction + goto loadusepool } + o1 = c.oaddi(p, int32(c.opirr(p, AADD)), hi, r, REGTMP) o2 = c.olsr12u(p, int32(c.opldr12(p, p.As)), ((v-hi)>>uint(s))&0xFFF, REGTMP, int(p.To.Reg)) + break + + loadusepool: + if r == REGTMP || p.From.Reg == REGTMP { + c.ctxt.Diag("REGTMP used in large offset load: %v", p) + } + o1 = c.omovlit(AMOVD, p, &p.From, REGTMP) + o2 = c.olsxrr(p, int32(c.opldrr(p, p.As)), int(p.To.Reg), r, REGTMP) case 32: /* mov $con, R -> movz/movn */ o1 = c.omovconst(p.As, p, &p.From, int(p.To.Reg)) @@ -2691,30 +2788,6 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 |= uint32(p.From.Reg&31) << 5 o1 |= uint32(p.To.Reg & 31) - case 47: /* movT R,V(R) -> strT (huge offset) */ - o1 = c.omovlit(AMOVW, p, &p.To, REGTMP) - - if !(o1 != 0) { - break - } - r := int(p.To.Reg) - if r == 0 { - r = int(o.param) - } - o2 = c.olsxrr(p, p.As, REGTMP, r, int(p.From.Reg)) - - case 48: /* movT V(R), R -> ldrT (huge offset) */ - o1 = 
c.omovlit(AMOVW, p, &p.From, REGTMP) - - if !(o1 != 0) { - break - } - r := int(p.From.Reg) - if r == 0 { - r = int(o.param) - } - o2 = c.olsxrr(p, p.As, REGTMP, r, int(p.To.Reg)) - case 50: /* sys/sysl */ o1 = c.opirr(p, p.As) @@ -4211,12 +4284,61 @@ func (c *ctxt7) opldrpp(p *obj.Prog, a obj.As) uint32 { return 0 } -/* - * load/store register (extended register) - */ -func (c *ctxt7) olsxrr(p *obj.Prog, as obj.As, rt int, r1 int, r2 int) uint32 { - c.ctxt.Diag("need load/store extended register\n%v", p) - return 0xffffffff +// olsxrr attaches register operands to a load/store opcode supplied in o. +// The result either encodes a load of r from (r1+r2) or a store of r to (r1+r2). +func (c *ctxt7) olsxrr(p *obj.Prog, o int32, r int, r1 int, r2 int) uint32 { + o |= int32(r1&31) << 5 + o |= int32(r2&31) << 16 + o |= int32(r & 31) + return uint32(o) +} + +// opldrr returns the ARM64 opcode encoding corresponding to the obj.As opcode +// for load instruction with register offset. +func (c *ctxt7) opldrr(p *obj.Prog, a obj.As) uint32 { + switch a { + case AMOVD: + return 0x1a<<10 | 0x3<<21 | 0x1f<<27 + case AMOVW: + return 0x1a<<10 | 0x5<<21 | 0x17<<27 + case AMOVWU: + return 0x1a<<10 | 0x3<<21 | 0x17<<27 + case AMOVH: + return 0x1a<<10 | 0x5<<21 | 0x0f<<27 + case AMOVHU: + return 0x1a<<10 | 0x3<<21 | 0x0f<<27 + case AMOVB: + return 0x1a<<10 | 0x5<<21 | 0x07<<27 + case AMOVBU: + return 0x1a<<10 | 0x3<<21 | 0x07<<27 + case AFMOVS: + return 0x1a<<10 | 0x3<<21 | 0x17<<27 | 1<<26 + case AFMOVD: + return 0x1a<<10 | 0x3<<21 | 0x1f<<27 | 1<<26 + } + c.ctxt.Diag("bad opldrr %v\n%v", a, p) + return 0 +} + +// opstrr returns the ARM64 opcode encoding corresponding to the obj.As opcode +// for store instruction with register offset. 
+func (c *ctxt7) opstrr(p *obj.Prog, a obj.As) uint32 { + switch a { + case AMOVD: + return 0x1a<<10 | 0x1<<21 | 0x1f<<27 + case AMOVW, AMOVWU: + return 0x1a<<10 | 0x1<<21 | 0x17<<27 + case AMOVH, AMOVHU: + return 0x1a<<10 | 0x1<<21 | 0x0f<<27 + case AMOVB, AMOVBU: + return 0x1a<<10 | 0x1<<21 | 0x07<<27 + case AFMOVS: + return 0x1a<<10 | 0x1<<21 | 0x17<<27 | 1<<26 + case AFMOVD: + return 0x1a<<10 | 0x1<<21 | 0x1f<<27 | 1<<26 + } + c.ctxt.Diag("bad opstrr %v\n%v", a, p) + return 0 } func (c *ctxt7) oaddi(p *obj.Prog, o1 int32, v int32, r int, rt int) uint32 { |
