aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/internal
diff options
context:
space:
mode:
authordiaxu01 <dianhong.xu@arm.com>2020-04-02 02:39:28 +0000
committerCherry Zhang <cherryyz@google.com>2020-09-16 14:56:18 +0000
commita86b6f23f08fd42154bc5bbfa417da6ee5ef48fb (patch)
tree194d4bb86dca2117e2edff06f28b4ebbd9abdaa2 /src/cmd/internal
parente82c9bd81654dab14f786c26af2dd8ea3a7a1737 (diff)
downloadgo-a86b6f23f08fd42154bc5bbfa417da6ee5ef48fb.tar.xz
cmd/internal/obj/arm64: optimize the instruction of moving long effective stack address
Currently, when the offset of "MOVD $offset(Rn), Rd" is a large positive constant or a negative constant, the assembler will load this offset from the constant pool.This patch gets rid of the constant pool by encoding the offset into two ADD instructions if it's a large positive constant or one SUB instruction if negative. For very large negative offset, it is rarely used, here we don't optimize this case. Optimized case 1: MOVD $-0x100000(R7), R0 Before: LDR 0x67670(constant pool), R27; ADD R27.UXTX, R0, R7 After: SUB $0x100000, R7, R0 Optimized case 2: MOVD $0x123468(R7), R0 Before: LDR 0x67670(constant pool), R27; ADD R27.UXTX, R0, R7 After: ADD $0x123000, R7, R27; ADD $0x000468, R27, R0 1. Binary size before/after. binary size change pkg/linux_arm64 +4KB pkg/tool/linux_arm64 no change go no change gofmt no change 2. go1 benckmark. name old time/op new time/op delta pkg:test/bench/go1 goos:linux goarch:arm64 BinaryTree17-64 7335721401.800000ns +-40% 6264542009.800000ns +-14% ~ (p=0.421 n=5+5) Fannkuch11-64 3886551822.600000ns +- 0% 3875870590.200000ns +- 0% ~ (p=0.151 n=5+5) FmtFprintfEmpty-64 82.960000ns +- 1% 83.900000ns +- 2% +1.13% (p=0.048 n=5+5) FmtFprintfString-64 149.200000ns +- 1% 148.000000ns +- 0% -0.80% (p=0.016 n=5+4) FmtFprintfInt-64 177.000000ns +- 0% 178.400000ns +- 2% ~ (p=0.794 n=4+5) FmtFprintfIntInt-64 240.200000ns +- 2% 239.400000ns +- 4% ~ (p=0.302 n=5+5) FmtFprintfPrefixedInt-64 300.400000ns +- 0% 299.200000ns +- 1% ~ (p=0.119 n=5+5) FmtFprintfFloat-64 360.000000ns +- 0% 361.600000ns +- 3% ~ (p=0.349 n=4+5) FmtManyArgs-64 1064.400000ns +- 1% 1061.400000ns +- 0% ~ (p=0.087 n=5+5) GobDecode-64 12080404.400000ns +- 2% 11637601.000000ns +- 1% -3.67% (p=0.008 n=5+5) GobEncode-64 8474973.800000ns +- 2% 7977801.600000ns +- 2% -5.87% (p=0.008 n=5+5) Gzip-64 416501238.400000ns +- 0% 410463405.400000ns +- 0% -1.45% (p=0.008 n=5+5) Gunzip-64 58088415.200000ns +- 0% 58826209.600000ns +- 0% +1.27% (p=0.008 n=5+5) HTTPClientServer-64 128660.200000ns +-23% 117840.800000ns +- 8% ~ (p=0.222 n=5+5) JSONEncode-64 17547746.800000ns +- 4% 17216180.000000ns +- 1% ~ (p=0.222 n=5+5) JSONDecode-64 80879896.000000ns +- 1% 80063737.200000ns +- 0% -1.01% (p=0.008 n=5+5) Mandelbrot200-64 5484901.600000ns +- 0% 5483614.400000ns +- 0% ~ (p=0.310 n=5+5) GoParse-64 6201166.800000ns +- 6% 6150920.600000ns +- 1% ~ (p=0.548 n=5+5) RegexpMatchEasy0_32-64 135.000000ns +- 0% 139.200000ns +- 7% ~ (p=0.643 n=5+5) RegexpMatchEasy0_1K-64 484.600000ns +- 2% 483.800000ns +- 2% ~ (p=0.984 n=5+5) RegexpMatchEasy1_32-64 128.000000ns +- 1% 124.600000ns +- 1% -2.66% (p=0.008 n=5+5) RegexpMatchEasy1_1K-64 769.400000ns +- 2% 761.400000ns +- 1% ~ (p=0.460 n=5+5) RegexpMatchMedium_32-64 12.900000ns +- 0% 12.500000ns +- 0% -3.10% (p=0.008 n=5+5) RegexpMatchMedium_1K-64 57879.200000ns +- 1% 56512.200000ns +- 0% -2.36% (p=0.008 n=5+5) RegexpMatchHard_32-64 3091.600000ns +- 1% 3071.000000ns +- 0% -0.67% (p=0.048 n=5+5) RegexpMatchHard_1K-64 92941.200000ns +- 1% 92794.000000ns +- 0% ~ (p=1.000 n=5+5) Revcomp-64 1695605187.000000ns +-54% 1821697637.400000ns +-47% ~ (p=1.000 n=5+5) Template-64 112839686.800000ns +- 1% 109964069.200000ns +- 3% ~ (p=0.095 n=5+5) TimeParse-64 587.000000ns +- 0% 587.000000ns +- 0% ~ (all equal) TimeFormat-64 586.000000ns +- 1% 584.200000ns +- 1% ~ (p=0.659 n=5+5) [Geo mean] 81804.262218ns 80694.712973ns -1.36% name old speed new speed delta pkg:test/bench/go1 goos:linux goarch:arm64 GobDecode-64 63.6MB/s +- 2% 66.0MB/s +- 1% +3.78% (p=0.008 n=5+5) GobEncode-64 90.6MB/s +- 2% 96.2MB/s +- 2% +6.23% (p=0.008 n=5+5) Gzip-64 46.6MB/s +- 0% 47.3MB/s +- 0% +1.47% (p=0.008 n=5+5) Gunzip-64 334MB/s +- 0% 330MB/s +- 0% -1.25% (p=0.008 n=5+5) JSONEncode-64 111MB/s +- 4% 113MB/s +- 1% ~ (p=0.222 n=5+5) JSONDecode-64 24.0MB/s +- 1% 24.2MB/s +- 0% +1.02% (p=0.008 n=5+5) GoParse-64 9.35MB/s +- 6% 9.42MB/s +- 1% ~ (p=0.571 n=5+5) RegexpMatchEasy0_32-64 237MB/s +- 0% 231MB/s +- 7% ~ (p=0.690 n=5+5) RegexpMatchEasy0_1K-64 2.11GB/s +- 2% 2.12GB/s +- 2% ~ (p=1.000 n=5+5) RegexpMatchEasy1_32-64 250MB/s +- 1% 257MB/s +- 1% +2.63% (p=0.008 n=5+5) RegexpMatchEasy1_1K-64 1.33GB/s +- 2% 1.35GB/s +- 1% ~ (p=0.548 n=5+5) RegexpMatchMedium_32-64 77.6MB/s +- 0% 79.8MB/s +- 0% +2.80% (p=0.008 n=5+5) RegexpMatchMedium_1K-64 17.7MB/s +- 1% 18.1MB/s +- 0% +2.41% (p=0.008 n=5+5) RegexpMatchHard_32-64 10.4MB/s +- 1% 10.4MB/s +- 0% ~ (p=0.056 n=5+5) RegexpMatchHard_1K-64 11.0MB/s +- 1% 11.0MB/s +- 0% ~ (p=0.984 n=5+5) Revcomp-64 188MB/s +-71% 155MB/s +-71% ~ (p=1.000 n=5+5) Template-64 17.2MB/s +- 1% 17.7MB/s +- 3% ~ (p=0.095 n=5+5) [Geo mean] 79.2MB/s 79.3MB/s +0.24% Change-Id: I593ac3e7037afafc3605ad4b0cfb51d5dd88015d Reviewed-on: https://go-review.googlesource.com/c/go/+/232438 Trust: Alberto Donizetti <alb.donizetti@gmail.com> Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/cmd/internal')
-rw-r--r--src/cmd/internal/obj/arm64/a.out.go7
-rw-r--r--src/cmd/internal/obj/arm64/anames7.go1
-rw-r--r--src/cmd/internal/obj/arm64/asm7.go46
3 files changed, 37 insertions, 17 deletions
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 2839da1437..b3c9e9a18e 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -410,9 +410,10 @@ const (
C_FCON // floating-point constant
C_VCONADDR // 64-bit memory address
- C_AACON // ADDCON offset in auto constant $a(FP)
- C_LACON // 32-bit offset in auto constant $a(FP)
- C_AECON // ADDCON offset in extern constant $e(SB)
+ C_AACON // ADDCON offset in auto constant $a(FP)
+ C_AACON2 // 24-bit offset in auto constant $a(FP)
+ C_LACON // 32-bit offset in auto constant $a(FP)
+ C_AECON // ADDCON offset in extern constant $e(SB)
// TODO(aram): only one branch class should be enough
C_SBRA // for TYPE_BRANCH
diff --git a/src/cmd/internal/obj/arm64/anames7.go b/src/cmd/internal/obj/arm64/anames7.go
index e1703fc4ab..96c9f788d9 100644
--- a/src/cmd/internal/obj/arm64/anames7.go
+++ b/src/cmd/internal/obj/arm64/anames7.go
@@ -36,6 +36,7 @@ var cnames7 = []string{
"FCON",
"VCONADDR",
"AACON",
+ "AACON2",
"LACON",
"AECON",
"SBRA",
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index df4bbbbd35..fc2033d689 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -391,7 +391,11 @@ var optab = []Optab{
{AMOVD, C_VCON, C_NONE, C_NONE, C_REG, 12, 16, 0, NOTUSETMP, 0},
{AMOVK, C_VCON, C_NONE, C_NONE, C_REG, 33, 4, 0, 0, 0},
- {AMOVD, C_AACON, C_NONE, C_NONE, C_REG, 4, 4, REGFROM, 0, 0},
+ {AMOVD, C_AACON, C_NONE, C_NONE, C_RSP, 4, 4, REGFROM, 0, 0},
+ {AMOVD, C_AACON2, C_NONE, C_NONE, C_RSP, 4, 8, REGFROM, 0, 0},
+
+ /* load long effective stack address (load int32 offset and add) */
+ {AMOVD, C_LACON, C_NONE, C_NONE, C_RSP, 34, 8, REGSP, LFROM, 0},
// Move a large constant to a Vn.
{AFMOVQ, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
@@ -594,9 +598,6 @@ var optab = []Optab{
{AFMOVD, C_LAUTO, C_NONE, C_NONE, C_FREG, 31, 8, REGSP, LFROM, 0},
{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
- /* load long effective stack address (load int32 offset and add) */
- {AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 34, 8, REGSP, LFROM, 0},
-
/* pre/post-indexed load (unscaled, signed 9-bit offset) */
{AMOVD, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
{AMOVW, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
@@ -1361,6 +1362,10 @@ func isaddcon(v int64) bool {
return v <= 0xFFF
}
+func isaddcon2(v int64) bool {
+ return 0 <= v && v <= 0xFFFFFF
+}
+
// isbitcon reports whether a constant can be encoded into a logical instruction.
// bitcon has a binary form of repetition of a bit sequence of length 2, 4, 8, 16, 32, or 64,
// which itself is a rotate (w.r.t. the length of the unit) of a sequence of ones.
@@ -1889,10 +1894,14 @@ func (c *ctxt7) aclass(a *obj.Addr) int {
default:
return C_GOK
}
-
- if isaddcon(c.instoffset) {
+ cf := c.instoffset
+ if isaddcon(cf) || isaddcon(-cf) {
return C_AACON
}
+ if isaddcon2(cf) {
+ return C_AACON2
+ }
+
return C_LACON
case obj.TYPE_BRANCH:
@@ -2046,7 +2055,7 @@ func cmp(a int, b int) bool {
return cmp(C_LCON, b)
case C_LACON:
- if b == C_AACON {
+ if b == C_AACON || b == C_AACON2 {
return true
}
@@ -3062,11 +3071,10 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
o1 |= (uint32(r&31) << 5) | uint32(rt&31)
- case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R */
- o1 = c.opirr(p, p.As)
-
+ case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R; mov $addcon2, R */
rt := int(p.To.Reg)
r := int(o.param)
+
if r == 0 {
r = REGZERO
} else if r == REGFROM {
@@ -3075,13 +3083,23 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
if r == 0 {
r = REGSP
}
+
v := int32(c.regoff(&p.From))
- if (v & 0xFFF000) != 0 {
- v >>= 12
- o1 |= 1 << 22 /* shift, by 12 */
+ var op int32
+ if v < 0 {
+ v = -v
+ op = int32(c.opirr(p, ASUB))
+ } else {
+ op = int32(c.opirr(p, AADD))
+ }
+
+ if int(o.size) == 8 {
+ o1 = c.oaddi(p, op, v&0xfff000, r, REGTMP)
+ o2 = c.oaddi(p, op, v&0x000fff, REGTMP, rt)
+ break
}
- o1 |= ((uint32(v) & 0xFFF) << 10) | (uint32(r&31) << 5) | uint32(rt&31)
+ o1 = c.oaddi(p, op, v, r, rt)
case 5: /* b s; bl s */
o1 = c.opbra(p, p.As)