aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/compile/internal
diff options
context:
space:
mode:
author: Guoqi Chen <chenguoqi@loongson.cn> 2025-11-17 11:33:04 +0800
committer: abner chenc <chenguoqi@loongson.cn> 2025-11-19 19:38:42 -0800
commit: c4bb9653ba28cba4bcd3a3cbb64285c495a03ba2 (patch)
tree: 66f072e7ac5c268c5a942ae16b671a2d047e7a2f /src/cmd/compile/internal
parent: 7f2ae21fb481e527086aafee6da3dafdca444f7a (diff)
download: go-c4bb9653ba28cba4bcd3a3cbb64285c495a03ba2.tar.xz
cmd/compile: Implement LoweredZeroLoop with LSX Instruction on loong64
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                    |   old.txt    |               new.txt               |
                    |    sec/op    |    sec/op     vs base               |
ClearFat256            6.406n ± 0%   3.329n ± 1%   -48.03% (p=0.000 n=10)
ClearFat512           12.810n ± 0%   7.607n ± 0%   -40.62% (p=0.000 n=10)
ClearFat1024           25.62n ± 0%   14.01n ± 0%   -45.32% (p=0.000 n=10)
ClearFat1032           26.02n ± 0%   14.28n ± 0%   -45.14% (p=0.000 n=10)
ClearFat1040           26.02n ± 0%   14.41n ± 0%   -44.62% (p=0.000 n=10)
MemclrKnownSize192     4.804n ± 0%   2.827n ± 0%   -41.15% (p=0.000 n=10)
MemclrKnownSize248     6.561n ± 0%   4.371n ± 0%   -33.38% (p=0.000 n=10)
MemclrKnownSize256     6.406n ± 0%   3.335n ± 0%   -47.94% (p=0.000 n=10)
geomean                11.41n        6.453n        -43.45%

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3C5000 @ 2200.00MHz
                    |   old.txt    |               new.txt               |
                    |    sec/op    |    sec/op     vs base               |
ClearFat256           14.570n ± 0%   7.284n ± 0%   -50.01% (p=0.000 n=10)
ClearFat512            29.13n ± 0%   14.57n ± 0%   -49.98% (p=0.000 n=10)
ClearFat1024           58.26n ± 0%   29.15n ± 0%   -49.97% (p=0.000 n=10)
ClearFat1032           58.73n ± 0%   29.15n ± 0%   -50.36% (p=0.000 n=10)
ClearFat1040           59.18n ± 0%   29.26n ± 0%   -50.56% (p=0.000 n=10)
MemclrKnownSize192    10.930n ± 0%   5.466n ± 0%   -49.99% (p=0.000 n=10)
MemclrKnownSize248    14.110n ± 0%   6.772n ± 0%   -52.01% (p=0.000 n=10)
MemclrKnownSize256    14.570n ± 0%   7.285n ± 0%   -50.00% (p=0.000 n=10)
geomean                25.75n        12.78n        -50.36%

Change-Id: I88d7b6ae2f6fc3f095979f24fb83ff42a9d2d42e
Reviewed-on: https://go-review.googlesource.com/c/go/+/720940
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Diffstat (limited to 'src/cmd/compile/internal')
-rw-r--r-- src/cmd/compile/internal/loong64/ssa.go | 153
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go | 1
-rw-r--r-- src/cmd/compile/internal/ssa/opGen.go | 1
3 files changed, 115 insertions, 40 deletions
diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go
index 84bbf9b394..71953109c4 100644
--- a/src/cmd/compile/internal/loong64/ssa.go
+++ b/src/cmd/compile/internal/loong64/ssa.go
@@ -575,6 +575,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
case ssa.OpLOONG64LoweredZeroLoop:
ptrReg := v.Args[0].Reg()
countReg := v.RegTmp()
+ flagReg := int16(loong64.REGTMP)
var off int64
n := v.AuxInt
loopSize := int64(64)
@@ -587,58 +588,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// vs
// 16 instuctions in the straightline code
// Might as well use straightline code.
- v.Fatalf("ZeroLoop size tool small %d", n)
+ v.Fatalf("ZeroLoop size too small %d", n)
}
- // Put iteration count in a register.
- // MOVV $n/loopSize, countReg
- p := s.Prog(loong64.AMOVV)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = n / loopSize
- p.To.Type = obj.TYPE_REG
- p.To.Reg = countReg
- cntInit := p
+ // MOVV $n/loopSize, countReg
+ // MOVBU ir.Syms.Loong64HasLSX, flagReg
+ // BNE flagReg, lsxInit
+ // genericInit:
+ // for off = 0; off < loopSize; off += 8 {
+ // zero8(s, ptrReg, off)
+ // }
+ // ADDV $loopSize, ptrReg
+ // SUBV $1, countReg
+ // BNE countReg, genericInit
+ // JMP tail
+ // lsxInit:
+ // VXORV V31, V31, V31, v31 = 0
+ // for off = 0; off < loopSize; off += 16 {
+ // zero16(s, V31, ptrReg, off)
+ // }
+ // ADDV $loopSize, ptrReg
+ // SUBV $1, countReg
+ // BNE countReg, lsxInit
+ // tail:
+ // n %= loopSize
+ // for off = 0; n >= 8; off += 8, n -= 8 {
+ // zero8(s, ptrReg, off)
+ // }
+ //
+ // if n != 0 {
+ // zero8(s, ptrReg, off+n-8)
+ // }
- // Zero loopSize bytes starting at ptrReg.
- for range loopSize / 8 {
- // MOVV ZR, off(ptrReg)
+ p1 := s.Prog(loong64.AMOVV)
+ p1.From.Type = obj.TYPE_CONST
+ p1.From.Offset = n / loopSize
+ p1.To.Type = obj.TYPE_REG
+ p1.To.Reg = countReg
+
+ p2 := s.Prog(loong64.AMOVBU)
+ p2.From.Type = obj.TYPE_MEM
+ p2.From.Name = obj.NAME_EXTERN
+ p2.From.Sym = ir.Syms.Loong64HasLSX
+ p2.To.Type = obj.TYPE_REG
+ p2.To.Reg = flagReg
+
+ p3 := s.Prog(loong64.ABNE)
+ p3.From.Type = obj.TYPE_REG
+ p3.From.Reg = flagReg
+ p3.To.Type = obj.TYPE_BRANCH
+
+ for off = 0; off < loopSize; off += 8 {
zero8(s, ptrReg, off)
- off += 8
}
- // Increment ptrReg by loopSize.
- // ADDV $loopSize, ptrReg
- p = s.Prog(loong64.AADDV)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = loopSize
- p.To.Type = obj.TYPE_REG
- p.To.Reg = ptrReg
+ p4 := s.Prog(loong64.AADDV)
+ p4.From.Type = obj.TYPE_CONST
+ p4.From.Offset = loopSize
+ p4.To.Type = obj.TYPE_REG
+ p4.To.Reg = ptrReg
- // Decrement loop count.
- // SUBV $1, countReg
- p = s.Prog(loong64.ASUBV)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = 1
- p.To.Type = obj.TYPE_REG
- p.To.Reg = countReg
+ p5 := s.Prog(loong64.ASUBV)
+ p5.From.Type = obj.TYPE_CONST
+ p5.From.Offset = 1
+ p5.To.Type = obj.TYPE_REG
+ p5.To.Reg = countReg
- // Jump to loop header if we're not done yet.
- // BNE countReg, loop header
- p = s.Prog(loong64.ABNE)
- p.From.Type = obj.TYPE_REG
- p.From.Reg = countReg
- p.To.Type = obj.TYPE_BRANCH
- p.To.SetTarget(cntInit.Link)
+ p6 := s.Prog(loong64.ABNE)
+ p6.From.Type = obj.TYPE_REG
+ p6.From.Reg = countReg
+ p6.To.Type = obj.TYPE_BRANCH
+ p6.To.SetTarget(p3.Link)
+
+ p7 := s.Prog(obj.AJMP)
+ p7.To.Type = obj.TYPE_BRANCH
+
+ p8 := s.Prog(loong64.AVXORV)
+ p8.From.Type = obj.TYPE_REG
+ p8.From.Reg = loong64.REG_V31
+ p8.To.Type = obj.TYPE_REG
+ p8.To.Reg = loong64.REG_V31
+ p3.To.SetTarget(p8)
+
+ for off = 0; off < loopSize; off += 16 {
+ zero16(s, loong64.REG_V31, ptrReg, off)
+ }
+
+ p9 := s.Prog(loong64.AADDV)
+ p9.From.Type = obj.TYPE_CONST
+ p9.From.Offset = loopSize
+ p9.To.Type = obj.TYPE_REG
+ p9.To.Reg = ptrReg
+
+ p10 := s.Prog(loong64.ASUBV)
+ p10.From.Type = obj.TYPE_CONST
+ p10.From.Offset = 1
+ p10.To.Type = obj.TYPE_REG
+ p10.To.Reg = countReg
+
+ p11 := s.Prog(loong64.ABNE)
+ p11.From.Type = obj.TYPE_REG
+ p11.From.Reg = countReg
+ p11.To.Type = obj.TYPE_BRANCH
+ p11.To.SetTarget(p8.Link)
+
+ p12 := s.Prog(obj.ANOP)
+ p7.To.SetTarget(p12)
// Multiples of the loop size are now done.
n %= loopSize
-
- off = 0
// Write any fractional portion.
- for n >= 8 {
- // MOVV ZR, off(ptrReg)
+ for off = 0; n >= 8; off += 8 {
+ // MOVV ZR, off(ptrReg)
zero8(s, ptrReg, off)
- off += 8
n -= 8
}
@@ -1333,7 +1395,7 @@ func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
// zero8 zeroes 8 bytes at reg+off.
func zero8(s *ssagen.State, reg int16, off int64) {
- // MOVV ZR, off(reg)
+ // MOVV ZR, off(reg)
p := s.Prog(loong64.AMOVV)
p.From.Type = obj.TYPE_REG
p.From.Reg = loong64.REGZERO
@@ -1341,3 +1403,14 @@ func zero8(s *ssagen.State, reg int16, off int64) {
p.To.Reg = reg
p.To.Offset = off
}
+
+// zero16 zeroes 16 bytes at reg+off.
+func zero16(s *ssagen.State, regZero, regBase int16, off int64) {
+ // VMOVQ regZero, off(regBase)
+ p := s.Prog(loong64.AVMOVQ)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = regZero
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = regBase
+ p.To.Offset = off
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
index 7e8b8bf497..81d3a3665b 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -388,6 +388,7 @@ func init() {
argLength: 2,
reg: regInfo{
inputs: []regMask{gp},
+ clobbers: buildReg("F31"),
clobbersArg0: true,
},
faultOnNilArg0: true,
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 264f4b3bf3..944e1d7854 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -26107,6 +26107,7 @@ var opcodeTable = [...]opInfo{
inputs: []inputInfo{
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
},
+ clobbers: 2305843009213693952, // F31
clobbersArg0: true,
},
},