about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: limeidan <limeidan@loongson.cn> 2025-08-29 15:40:31 +0800
committer: Gopher Robot <gobot@golang.org> 2025-09-03 15:11:55 -0700
commit: 91e76a513bdfa4159ea0aa65a01f89e006e6ead3 (patch)
tree: 040ea9e18f8bad0621ccc19cd9636399d5b5c23e /src
parent: c552ad913fd7117a70cc3abee6b44a15aac060e2 (diff)
download: go-91e76a513bdfa4159ea0aa65a01f89e006e6ead3.tar.xz
cmd/compile: use generated loops instead of DUFFCOPY on loong64
Change-Id: If9da2b5681e5d05d7c3d51f003f1fe662d3feaec
Reviewed-on: https://go-review.googlesource.com/c/go/+/699855
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Diffstat (limited to 'src')
-rw-r--r-- src/cmd/compile/internal/loong64/ssa.go 157
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/LOONG64.rules 30
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go 43
-rw-r--r-- src/cmd/compile/internal/ssa/opGen.go 26
-rw-r--r-- src/cmd/compile/internal/ssa/rewriteLOONG64.go 49
5 files changed, 188 insertions, 117 deletions
diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go
index c917d14298..3959f8a7c1 100644
--- a/src/cmd/compile/internal/loong64/ssa.go
+++ b/src/cmd/compile/internal/loong64/ssa.go
@@ -659,42 +659,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Sym = ir.Syms.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpLOONG64LoweredMove:
- // MOVx (Rarg1), Rtmp
- // MOVx Rtmp, (Rarg0)
- // ADDV $sz, Rarg1
- // ADDV $sz, Rarg0
- // BGEU Rarg2, Rarg0, -4(PC)
- mov, sz := largestMove(v.AuxInt)
- p := s.Prog(mov)
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = v.Args[1].Reg()
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+ if dstReg == srcReg {
+ break
+ }
+ tmpReg := int16(loong64.REG_R20)
+ n := v.AuxInt
+ if n < 16 {
+ v.Fatalf("Move too small %d", n)
+ }
+
+ var off int64
+ for n >= 8 {
+ // MOVV off(srcReg), tmpReg
+ // MOVV tmpReg, off(dstReg)
+ move8(s, srcReg, dstReg, tmpReg, off)
+ off += 8
+ n -= 8
+ }
+
+ if n != 0 {
+ // MOVV off+n-8(srcReg), tmpReg
+ // MOVV tmpReg, off+n-8(dstReg)
+ move8(s, srcReg, dstReg, tmpReg, off+n-8)
+ }
+ case ssa.OpLOONG64LoweredMoveLoop:
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+ if dstReg == srcReg {
+ break
+ }
+ countReg := int16(loong64.REG_R20)
+ tmpReg := int16(loong64.REG_R21)
+ var off int64
+ n := v.AuxInt
+ loopSize := int64(64)
+ if n < 3*loopSize {
+ // - a loop count of 0 won't work.
+ // - a loop count of 1 is useless.
+ // - a loop count of 2 is a code size ~tie
+ // 4 instructions to implement the loop
+ // 8 instructions in the loop body
+ // vs
+ // 16 instructions in the straightline code
+ // Might as well use straightline code.
+ v.Fatalf("ZeroLoop size too small %d", n)
+ }
+
+ // Put iteration count in a register.
+ // MOVV $n/loopSize, countReg
+ p := s.Prog(loong64.AMOVV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
- p.To.Reg = loong64.REGTMP
+ p.To.Reg = countReg
+ cntInit := p
- p2 := s.Prog(mov)
- p2.From.Type = obj.TYPE_REG
- p2.From.Reg = loong64.REGTMP
- p2.To.Type = obj.TYPE_MEM
- p2.To.Reg = v.Args[0].Reg()
+ // Move loopSize bytes starting at srcReg to dstReg.
+ for range loopSize / 8 {
+ // MOVV off(srcReg), tmpReg
+ // MOVV tmpReg, off(dstReg)
+ move8(s, srcReg, dstReg, tmpReg, off)
+ off += 8
+ }
- p3 := s.Prog(loong64.AADDVU)
- p3.From.Type = obj.TYPE_CONST
- p3.From.Offset = sz
- p3.To.Type = obj.TYPE_REG
- p3.To.Reg = v.Args[1].Reg()
+ // Increment srcReg and dstReg by loopSize.
+ // ADDV $loopSize, srcReg
+ p = s.Prog(loong64.AADDV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = srcReg
+ // ADDV $loopSize, dstReg
+ p = s.Prog(loong64.AADDV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = dstReg
- p4 := s.Prog(loong64.AADDVU)
- p4.From.Type = obj.TYPE_CONST
- p4.From.Offset = sz
- p4.To.Type = obj.TYPE_REG
- p4.To.Reg = v.Args[0].Reg()
+ // Decrement loop count.
+ // SUBV $1, countReg
+ p = s.Prog(loong64.ASUBV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
- p5 := s.Prog(loong64.ABGEU)
- p5.From.Type = obj.TYPE_REG
- p5.From.Reg = v.Args[2].Reg()
- p5.Reg = v.Args[1].Reg()
- p5.To.Type = obj.TYPE_BRANCH
- p5.To.SetTarget(p)
+ // Jump to loop header if we're not done yet.
+ // BNE countReg, loop header
+ p = s.Prog(loong64.ABNE)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = countReg
+ p.To.Type = obj.TYPE_BRANCH
+ p.To.SetTarget(cntInit.Link)
+
+ // Multiples of the loop size are now done.
+ n %= loopSize
+
+ off = 0
+ // Copy any fractional portion.
+ for n >= 8 {
+ // MOVV off(srcReg), tmpReg
+ // MOVV tmpReg, off(dstReg)
+ move8(s, srcReg, dstReg, tmpReg, off)
+ off += 8
+ n -= 8
+ }
+
+ if n != 0 {
+ // MOVV off+n-8(srcReg), tmpReg
+ // MOVV tmpReg, off+n-8(dstReg)
+ move8(s, srcReg, dstReg, tmpReg, off+n-8)
+ }
case ssa.OpLOONG64CALLstatic, ssa.OpLOONG64CALLclosure, ssa.OpLOONG64CALLinter:
s.Call(v)
@@ -1225,6 +1302,24 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
return p
}
+// move8 copies 8 bytes at src+off to dst+off.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+ // MOVV off(src), tmp
+ ld := s.Prog(loong64.AMOVV)
+ ld.From.Type = obj.TYPE_MEM
+ ld.From.Reg = src
+ ld.From.Offset = off
+ ld.To.Type = obj.TYPE_REG
+ ld.To.Reg = tmp
+ // MOVV tmp, off(dst)
+ st := s.Prog(loong64.AMOVV)
+ st.From.Type = obj.TYPE_REG
+ st.From.Reg = tmp
+ st.To.Type = obj.TYPE_MEM
+ st.To.Reg = dst
+ st.To.Offset = off
+}
+
// zero8 zeroes 8 bytes at reg+off.
func zero8(s *ssagen.State, reg int16, off int64) {
// MOVV ZR, off(reg)
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
index cb55c16c3e..3fa4f363f6 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -419,34 +419,8 @@
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
-// strip off fractional word move
-(Move [s] dst src mem) && s%8 != 0 && s > 16 =>
- (Move [s%8]
- (OffPtr <dst.Type> dst [s-s%8])
- (OffPtr <src.Type> src [s-s%8])
- (Move [s-s%8] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
- && s%8 == 0 && s > 16 && s <= 8*128
- && logLargeCopy(v, s) =>
- (DUFFCOPY [16 * (128 - s/8)] dst src mem)
-// 16 and 128 are magic constants. 16 is the number of bytes to encode:
-// MOVV (R20), R30
-// ADDV $8, R20
-// MOVV R30, (R21)
-// ADDV $8, R21
-// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy.
-
-// large move uses a loop
-(Move [s] dst src mem)
- && s%8 == 0 && s > 1024 && logLargeCopy(v, s) =>
- (LoweredMove
- dst
- src
- (ADDVconst <src.Type> src [s-8])
- mem)
-
+(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
// float <=> int register moves, with no conversion.
// These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}.
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
index 359cb42056..cc6ae8fb8e 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -429,27 +429,40 @@ func init() {
needIntTemp: true,
},
- // large or unaligned move
- // arg0 = address of dst memory (in R21, changed as side effect)
- // arg1 = address of src memory (in R20, changed as side effect)
- // arg2 = address of the last element of src
- // arg3 = mem
- // auxint = alignment
+ // medium copying
+ // arg0 = address of dst memory
+ // arg1 = address of src memory
+ // arg2 = mem
+ // auxint = number of bytes to copy
// returns mem
- // MOVx (R20), Rtmp
- // MOVx Rtmp, (R21)
- // ADDV $sz, R20
- // ADDV $sz, R21
- // BGEU Rarg2, R20, -4(PC)
{
name: "LoweredMove",
aux: "Int64",
- argLength: 4,
+ argLength: 3,
reg: regInfo{
- inputs: []regMask{buildReg("R21"), buildReg("R20"), gp},
- clobbers: buildReg("R20 R21"),
+ inputs: []regMask{gp &^ buildReg("R20"), gp &^ buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large copying
+ // arg0 = address of dst memory
+ // arg1 = address of src memory
+ // arg2 = mem
+ // auxint = number of bytes to copy
+ // returns mem
+ {
+ name: "LoweredMoveLoop",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp &^ buildReg("R20 R21"), gp &^ buildReg("R20 R21")},
+ clobbers: buildReg("R20 R21"),
+ clobbersArg0: true,
+ clobbersArg1: true,
},
- typ: "Mem",
faultOnNilArg0: true,
faultOnNilArg1: true,
},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index fca7d81017..f42d64228f 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1929,6 +1929,7 @@ const (
OpLOONG64DUFFCOPY
OpLOONG64LoweredZeroLoop
OpLOONG64LoweredMove
+ OpLOONG64LoweredMoveLoop
OpLOONG64LoweredAtomicLoad8
OpLOONG64LoweredAtomicLoad32
OpLOONG64LoweredAtomicLoad64
@@ -25986,16 +25987,31 @@ var opcodeTable = [...]opInfo{
{
name: "LoweredMove",
auxType: auxInt64,
- argLen: 4,
+ argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 1048576}, // R21
- {1, 524288}, // R20
- {2, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+ {0, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
+ {1, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
},
- clobbers: 1572864, // R20 R21
+ clobbers: 524288, // R20
+ },
+ },
+ {
+ name: "LoweredMoveLoop",
+ auxType: auxInt64,
+ argLen: 3,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
+ {1, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
+ },
+ clobbers: 1572864, // R20 R21
+ clobbersArg0: true,
+ clobbersArg1: true,
},
},
{
diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
index ae3358e5e5..5890fe050a 100644
--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go
+++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
@@ -9133,62 +9133,35 @@ func rewriteValueLOONG64_OpMove(v *Value) bool {
return true
}
// match: (Move [s] dst src mem)
- // cond: s%8 != 0 && s > 16
- // result: (Move [s%8] (OffPtr <dst.Type> dst [s-s%8]) (OffPtr <src.Type> src [s-s%8]) (Move [s-s%8] dst src mem))
+ // cond: s > 16 && s < 192 && logLargeCopy(v, s)
+ // result: (LoweredMove [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s%8 != 0 && s > 16) {
+ if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
break
}
- v.reset(OpMove)
- v.AuxInt = int64ToAuxInt(s % 8)
- v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
- v0.AuxInt = int64ToAuxInt(s - s%8)
- v0.AddArg(dst)
- v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
- v1.AuxInt = int64ToAuxInt(s - s%8)
- v1.AddArg(src)
- v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
- v2.AuxInt = int64ToAuxInt(s - s%8)
- v2.AddArg3(dst, src, mem)
- v.AddArg3(v0, v1, v2)
- return true
- }
- // match: (Move [s] dst src mem)
- // cond: s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)
- // result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
- for {
- s := auxIntToInt64(v.AuxInt)
- dst := v_0
- src := v_1
- mem := v_2
- if !(s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)) {
- break
- }
- v.reset(OpLOONG64DUFFCOPY)
- v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+ v.reset(OpLOONG64LoweredMove)
+ v.AuxInt = int64ToAuxInt(s)
v.AddArg3(dst, src, mem)
return true
}
// match: (Move [s] dst src mem)
- // cond: s%8 == 0 && s > 1024 && logLargeCopy(v, s)
- // result: (LoweredMove dst src (ADDVconst <src.Type> src [s-8]) mem)
+ // cond: s >= 192 && logLargeCopy(v, s)
+ // result: (LoweredMoveLoop [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s%8 == 0 && s > 1024 && logLargeCopy(v, s)) {
+ if !(s >= 192 && logLargeCopy(v, s)) {
break
}
- v.reset(OpLOONG64LoweredMove)
- v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, src.Type)
- v0.AuxInt = int64ToAuxInt(s - 8)
- v0.AddArg(src)
- v.AddArg4(dst, src, v0, mem)
+ v.reset(OpLOONG64LoweredMoveLoop)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg3(dst, src, mem)
return true
}
return false