diff options
| author | David Chase <drchase@google.com> | 2025-08-14 17:31:09 -0400 |
|---|---|---|
| committer | David Chase <drchase@google.com> | 2025-08-22 21:19:38 -0700 |
| commit | a5137ec92a96d36669e4de43c3cbec5c749e482d (patch) | |
| tree | 99b6cd362a0d7d603d18ff4aa3eab04dc1de81f1 /src/cmd/compile | |
| parent | 83714616aac5b1721da8b7644065be0b770a6748 (diff) | |
| download | go-a5137ec92a96d36669e4de43c3cbec5c749e482d.tar.xz | |
[dev.simd] cmd/compile: sample peephole optimization for SIMD broadcast
After tinkering and a rewrite, this also optimizes some instances
of SetElem(0).
Change-Id: Ibba2d50a56b68ccf9de517ef24ca52b64c6c5b2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/696376
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/cmd/compile')
| -rw-r--r-- | src/cmd/compile/internal/amd64/ssa.go | 22 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/AMD64.rules | 14 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 7 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/rulegen.go | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/opGen.go | 56 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/rewriteAMD64.go | 252 |
6 files changed, 352 insertions, 5 deletions
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index ec4eaaed03..58a0f9cc81 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1711,8 +1711,26 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { // SIMD ops case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL: s.Prog(v.Op.Asm()) - case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: - // zero-width, no instruction generated + + case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted + + case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v: + // These are for initializing the least 32/64 bits of a SIMD register from a "float". + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.AddRestSourceReg(x86.REG_X15) + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v) + + case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ: + // These are for initializing the least 32/64 bits of a SIMD register from an "int". 
+ p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v) + case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 913ddbf559..0c7c7ced43 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1768,3 +1768,17 @@ (VPANDQ512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k) (VPANDQ512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k) (VPANDQ512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k) + +// Insert to zero of 32/64 bit floats and ints to a zero is just MOVS[SD] +(VPINSRQ128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSDf2v <types.TypeVec128> y) +(VPINSRD128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSSf2v <types.TypeVec128> y) +(VPINSRQ128 [0] (Zero128 <t>) y) && !y.Type.IsFloat() => (VMOVQ <types.TypeVec128> y) +(VPINSRD128 [0] (Zero128 <t>) y) && !y.Type.IsFloat() => (VMOVD <types.TypeVec128> y) + +// These rewrites can skip zero-extending the 8/16-bit inputs because they are +// only used as the input to a broadcast; the potentially "bad" bits are ignored +(VPBROADCASTB(128|256|512) x:(VPINSRB128 [0] (Zero128 <t>) y)) && x.Uses == 1 => + (VPBROADCASTB(128|256|512) (VMOVQ <types.TypeVec128> y)) +(VPBROADCASTW(128|256|512) x:(VPINSRW128 [0] (Zero128 <t>) y)) && x.Uses == 1 => + (VPBROADCASTW(128|256|512) (VMOVQ <types.TypeVec128> y)) + diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index 12be7cae41..03f38db640 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -226,6 +226,8 @@ func init() { vgp = regInfo{inputs: vonly, outputs: gponly} vfpv = regInfo{inputs: []regMask{vz, 
fp}, outputs: vonly} vfpkv = regInfo{inputs: []regMask{vz, fp, mask}, outputs: vonly} + fpv = regInfo{inputs: []regMask{fp}, outputs: vonly} + gpv = regInfo{inputs: []regMask{gp}, outputs: vonly} w11 = regInfo{inputs: wzonly, outputs: wonly} w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly} @@ -1382,6 +1384,11 @@ func init() { {name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, {name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, + {name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"}, + {name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"}, + {name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"}, + {name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"}, + {name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"}, {name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, diff --git a/src/cmd/compile/internal/ssa/_gen/rulegen.go b/src/cmd/compile/internal/ssa/_gen/rulegen.go index d4ca1aef22..b16f9567ba 100644 --- a/src/cmd/compile/internal/ssa/_gen/rulegen.go +++ b/src/cmd/compile/internal/ssa/_gen/rulegen.go @@ -875,7 +875,7 @@ func declReserved(name, value string) *Declare { if !reservedNames[name] { panic(fmt.Sprintf("declReserved call does not use a reserved name: %q", name)) } - return &Declare{name, exprf(value)} + return &Declare{name, exprf("%s", value)} } // breakf constructs a simple "if cond { break }" statement, using exprf for its @@ -902,7 +902,7 @@ func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite { if vname == "" { vname = fmt.Sprintf("v_%v", i) } - rr.add(declf(rr.Loc, vname, cname)) + rr.add(declf(rr.Loc, vname, "%s", cname)) p, op := genMatch0(rr, arch, expr, vname, nil, false) // TODO: pass non-nil cnt? 
if op != "" { check := fmt.Sprintf("%s.Op == %s", cname, op) @@ -917,7 +917,7 @@ func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite { } pos[i] = p } else { - rr.add(declf(rr.Loc, arg, cname)) + rr.add(declf(rr.Loc, arg, "%s", cname)) pos[i] = arg + ".Pos" } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 76b0f84f35..7f6e9a0282 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1214,6 +1214,10 @@ const ( OpAMD64Zero128 OpAMD64Zero256 OpAMD64Zero512 + OpAMD64VMOVSDf2v + OpAMD64VMOVSSf2v + OpAMD64VMOVQ + OpAMD64VMOVD OpAMD64VZEROUPPER OpAMD64VZEROALL OpAMD64KMOVQload @@ -18870,6 +18874,58 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VMOVSDf2v", + argLen: 1, + asm: x86.AVMOVSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVSSf2v", + argLen: 1, + asm: x86.AVMOVSS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVQ", + argLen: 1, + asm: x86.AVMOVQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVD", + argLen: 1, + asm: x86.AVMOVD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { name: "VZEROUPPER", argLen: 0, asm: x86.AVZEROUPPER, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go 
b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 77ae32519a..469417536f 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -517,6 +517,22 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v) case OpAMD64VPANDQ512: return rewriteValueAMD64_OpAMD64VPANDQ512(v) + case OpAMD64VPBROADCASTB128: + return rewriteValueAMD64_OpAMD64VPBROADCASTB128(v) + case OpAMD64VPBROADCASTB256: + return rewriteValueAMD64_OpAMD64VPBROADCASTB256(v) + case OpAMD64VPBROADCASTB512: + return rewriteValueAMD64_OpAMD64VPBROADCASTB512(v) + case OpAMD64VPBROADCASTW128: + return rewriteValueAMD64_OpAMD64VPBROADCASTW128(v) + case OpAMD64VPBROADCASTW256: + return rewriteValueAMD64_OpAMD64VPBROADCASTW256(v) + case OpAMD64VPBROADCASTW512: + return rewriteValueAMD64_OpAMD64VPBROADCASTW512(v) + case OpAMD64VPINSRD128: + return rewriteValueAMD64_OpAMD64VPINSRD128(v) + case OpAMD64VPINSRQ128: + return rewriteValueAMD64_OpAMD64VPINSRQ128(v) case OpAMD64VPMOVVec16x16ToM: return rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v) case OpAMD64VPMOVVec16x32ToM: @@ -28848,6 +28864,242 @@ func rewriteValueAMD64_OpAMD64VPANDQ512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPBROADCASTB128(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTB128 x:(VPINSRB128 [0] (Zero128 <t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTB128 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTB128) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPBROADCASTB256(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTB256 x:(VPINSRB128 [0] (Zero128 
<t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTB256 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTB256) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPBROADCASTB512(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTB512 x:(VPINSRB128 [0] (Zero128 <t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTB512 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTB512) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPBROADCASTW128(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTW128 x:(VPINSRW128 [0] (Zero128 <t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTW128 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRW128 || auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTW128) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPBROADCASTW256(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTW256 x:(VPINSRW128 [0] (Zero128 <t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTW256 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRW128 || 
auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTW256) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPBROADCASTW512(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (VPBROADCASTW512 x:(VPINSRW128 [0] (Zero128 <t>) y)) + // cond: x.Uses == 1 + // result: (VPBROADCASTW512 (VMOVQ <types.TypeVec128> y)) + for { + x := v_0 + if x.Op != OpAMD64VPINSRW128 || auxIntToUint8(x.AuxInt) != 0 { + break + } + y := x.Args[1] + x_0 := x.Args[0] + if x_0.Op != OpAMD64Zero128 { + break + } + if !(x.Uses == 1) { + break + } + v.reset(OpAMD64VPBROADCASTW512) + v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128) + v0.AddArg(y) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPINSRD128(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPINSRD128 [0] (Zero128 <t>) y) + // cond: y.Type.IsFloat() + // result: (VMOVSSf2v <types.TypeVec128> y) + for { + if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 { + break + } + y := v_1 + if !(y.Type.IsFloat()) { + break + } + v.reset(OpAMD64VMOVSSf2v) + v.Type = types.TypeVec128 + v.AddArg(y) + return true + } + // match: (VPINSRD128 [0] (Zero128 <t>) y) + // cond: !y.Type.IsFloat() + // result: (VMOVD <types.TypeVec128> y) + for { + if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 { + break + } + y := v_1 + if !(!y.Type.IsFloat()) { + break + } + v.reset(OpAMD64VMOVD) + v.Type = types.TypeVec128 + v.AddArg(y) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPINSRQ128(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPINSRQ128 [0] (Zero128 <t>) y) + // cond: y.Type.IsFloat() + // result: (VMOVSDf2v <types.TypeVec128> y) + for { + if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != 
OpAMD64Zero128 { + break + } + y := v_1 + if !(y.Type.IsFloat()) { + break + } + v.reset(OpAMD64VMOVSDf2v) + v.Type = types.TypeVec128 + v.AddArg(y) + return true + } + // match: (VPINSRQ128 [0] (Zero128 <t>) y) + // cond: !y.Type.IsFloat() + // result: (VMOVQ <types.TypeVec128> y) + for { + if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 { + break + } + y := v_1 + if !(!y.Type.IsFloat()) { + break + } + v.reset(OpAMD64VMOVQ) + v.Type = types.TypeVec128 + v.AddArg(y) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v *Value) bool { v_0 := v.Args[0] // match: (VPMOVVec16x16ToM (VPMOVMToVec16x16 x)) |
