From 967465da2975fe4322080703ce5a77ea90752829 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Mon, 31 Aug 2020 09:43:40 -0400
Subject: cmd/compile: use combined shifts to improve array addressing on ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds rules to find pairs of instructions that can be combined
into a single shift. These instruction sequences are common in array
addressing within loops. Improvements can be seen in many crypto packages
and the hash packages. These are based on the extended mnemonics found in
the ISA sections C.8.1 and C.8.2.

Some rules in PPC64.rules were moved because the ordering prevented some
matching.

The following results were generated on power9.

hash/crc32:
CRC32/poly=Koopman/size=40/align=0     195ns ± 0%    163ns ± 0%  -16.41%
CRC32/poly=Koopman/size=40/align=1     200ns ± 0%    163ns ± 0%  -18.50%
CRC32/poly=Koopman/size=512/align=0   1.98µs ± 0%   1.67µs ± 0%  -15.46%
CRC32/poly=Koopman/size=512/align=1   1.98µs ± 0%   1.69µs ± 0%  -14.80%
CRC32/poly=Koopman/size=1kB/align=0   3.90µs ± 0%   3.31µs ± 0%  -15.27%
CRC32/poly=Koopman/size=1kB/align=1   3.85µs ± 0%   3.31µs ± 0%  -14.15%
CRC32/poly=Koopman/size=4kB/align=0   15.3µs ± 0%   13.1µs ± 0%  -14.22%
CRC32/poly=Koopman/size=4kB/align=1   15.4µs ± 0%   13.1µs ± 0%  -14.79%
CRC32/poly=Koopman/size=32kB/align=0   137µs ± 0%    105µs ± 0%  -23.56%
CRC32/poly=Koopman/size=32kB/align=1   137µs ± 0%    105µs ± 0%  -23.53%

crypto/rc4:
RC4_128   733ns ± 0%   650ns ± 0%  -11.32%  (p=1.000 n=1+1)
RC4_1K   5.80µs ± 0%  5.17µs ± 0%  -10.89%  (p=1.000 n=1+1)
RC4_8K   45.7µs ± 0%  40.8µs ± 0%  -10.73%  (p=1.000 n=1+1)

crypto/sha1:
Hash8Bytes      635ns ± 0%   613ns ± 0%  -3.46%  (p=1.000 n=1+1)
Hash320Bytes   2.30µs ± 0%  2.18µs ± 0%  -5.38%  (p=1.000 n=1+1)
Hash1K         5.88µs ± 0%  5.38µs ± 0%  -8.62%  (p=1.000 n=1+1)
Hash8K         42.0µs ± 0%  37.9µs ± 0%  -9.75%  (p=1.000 n=1+1)

There are other improvements found in golang.org/x/crypto which are all in
the range of 5-15%.
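As a rough illustration of the pattern these rules target (a hypothetical
sketch modeled on the new codegen tests, not code from this CL; the function
and variable names are made up), indexing a table of 4-byte elements with a
truncated index needs both a mask for the truncation and a shift left by 2
to scale the index, and the new rules fold that mask+shift pair into a
single rotate-and-mask instruction:

package example

// checkSum is hypothetical: byte(v) becomes a mask, and indexing a
// uint32 table scales the index by 4 (a shift left by 2). With the
// combined-shift rules the mask and shift are emitted as one
// CLRLSLDI/CLRLSLWI on ppc64x instead of two separate instructions.
func checkSum(tab *[256]uint32, v uint32, b byte) uint32 {
        return tab[byte(v)^b] + tab[v&0xff]
}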
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger Run-TryBot: Lynn Boger TryBot-Result: Go Bot Reviewed-by: Carlos Eduardo Seo --- test/codegen/shift.go | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'test/codegen') diff --git a/test/codegen/shift.go b/test/codegen/shift.go index 5e50ea6bff..32214851b5 100644 --- a/test/codegen/shift.go +++ b/test/codegen/shift.go @@ -150,6 +150,61 @@ func lshGuarded64(v int64, s uint) int64 { panic("shift too large") } +func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byte) (uint32, uint64) { + + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f := tab[byte(v)^b] + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f += tab[byte(v)&b] + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f += tab[byte(v)|b] + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f += tab[uint16(v)&h] + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f += tab[uint16(v)^h] + // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + f += tab[uint16(v)|h] + // ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI" + // ppc64:-".*AND",-"RLDICR",".*CLRLSLDI" + f += tab[v&0xff] + // ppc64le:-".*AND",".*CLRLSLWI" + // ppc64:-".*AND",".*CLRLSLWI" + f += 2*uint32(uint16(d)) + // ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI" + // ppc64:-".*AND",-"RLDICR",".*CLRLSLDI" + g := 2*uint64(uint32(d)) + return f, g +} + +func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, v64 uint64) (uint8, uint16, uint32, uint64) { + + // ppc64le:-"AND","CLRLSLWI" + // ppc64:-"AND","CLRLSLWI" + f := (v8 &0xF) << 2 + // ppc64le:-"AND","CLRLSLWI" + // ppc64:-"AND","CLRLSLWI" + f += byte(v16)<<3 + // ppc64le:-"AND","CLRLSLWI" + // ppc64:-"AND","CLRLSLWI" + g := (v16 & 0xFF) << 3 + // ppc64le:-"AND","CLRLSLWI" + // ppc64:-"AND","CLRLSLWI" + h := (v32 & 0xFFFFF) << 2 + // ppc64le:-"AND","CLRLSLWI" + // ppc64:-"AND","CLRLSLWI" + h += uint32(v64)<<4 + // ppc64le:-"AND","CLRLSLDI" + // ppc64:-"AND","CLRLSLDI" + i := (v64 & 0xFFFFFFFF) << 5 + return f, g, h, i +} + func checkWidenAfterShift(v int64, u uint64) (int64, uint64) { // ppc64le:-".*MOVW" -- cgit v1.3 From a424f6e45e29960c933a7ccc1cd8fc9bb2766f15 Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Wed, 23 Sep 2020 11:06:39 -0400 Subject: cmd/asm,cmd/compile,cmd/internal/obj/ppc64: add extswsli support on power9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for the extswsli instruction which combines extsw followed by a shift. 
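A minimal sketch of the Go pattern this helps (hypothetical names, not taken
from the CL): sign-extending an int32 and then shifting left by a small
constant previously needed EXTSW followed by a shift, and on power9 it can
now be emitted as a single EXTSWSLI.

package example

// scale is hypothetical: int64(x) << 3 sign-extends a word and then
// shifts it left, which maps onto one extswsli on power9 (GOPPC64>=9).
func scale(x int32) int64 {
        return int64(x) << 3
}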
New benchmark demonstrates the improvement: name old time/op new time/op delta ExtShift 1.34µs ± 0% 1.30µs ± 0% -3.15% (p=0.057 n=4+3) Change-Id: I21b410676fdf15d20e0cbbaa75d7c6dcd3bbb7b0 Reviewed-on: https://go-review.googlesource.com/c/go/+/257017 Run-TryBot: Lynn Boger TryBot-Result: Go Bot Reviewed-by: Carlos Eduardo Seo Trust: Lynn Boger --- src/cmd/asm/internal/asm/testdata/ppc64enc.s | 1 + src/cmd/compile/internal/gc/bench_test.go | 12 ++++ src/cmd/compile/internal/ppc64/ssa.go | 2 +- src/cmd/compile/internal/ssa/gen/PPC64.rules | 2 + src/cmd/compile/internal/ssa/gen/PPC64Ops.go | 1 + src/cmd/compile/internal/ssa/opGen.go | 15 ++++ src/cmd/compile/internal/ssa/rewritePPC64.go | 36 ++++++++++ src/cmd/internal/obj/ppc64/a.out.go | 2 + src/cmd/internal/obj/ppc64/anames.go | 2 + src/cmd/internal/obj/ppc64/asm9.go | 104 +++++++++++++++++---------- test/codegen/shift.go | 7 +- 11 files changed, 142 insertions(+), 42 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/asm/internal/asm/testdata/ppc64enc.s b/src/cmd/asm/internal/asm/testdata/ppc64enc.s index e26f6f8933..88a7609ba8 100644 --- a/src/cmd/asm/internal/asm/testdata/ppc64enc.s +++ b/src/cmd/asm/internal/asm/testdata/ppc64enc.s @@ -266,6 +266,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 SRDCC R3, R4 // 7c841c37 ROTLW $16, R3, R4 // 5464803e ROTLW R3, R4, R5 // 5c85183e + EXTSWSLI $3, R4, R5 // 7c851ef4 RLWMI $7, R3, $65535, R6 // 50663c3e RLWMICC $7, R3, $65535, R6 // 50663c3f RLWNM $3, R4, $7, R6 // 54861f7e diff --git a/src/cmd/compile/internal/gc/bench_test.go b/src/cmd/compile/internal/gc/bench_test.go index 09aaf428c3..a2887f2f7b 100644 --- a/src/cmd/compile/internal/gc/bench_test.go +++ b/src/cmd/compile/internal/gc/bench_test.go @@ -20,6 +20,18 @@ func BenchmarkLoadAdd(b *testing.B) { } } +// Added for ppc64 extswsli on power9 +func BenchmarkExtShift(b *testing.B) { + x := make([]int32, 1024) + for i := 0; i < b.N; i++ { + var s int64 + for i := range x { + s ^= int64(x[i]+32) * 8 + } + globl = s + } +} + func BenchmarkModify(b *testing.B) { a := make([]int64, 1024) v := globl diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index 4a83a0bdd7..a5fbdaffba 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -677,7 +677,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Reg = v.Args[0].Reg() case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst, - ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst: + ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst: p := s.Prog(v.Op.Asm()) p.Reg = v.Args[0].Reg() p.From.Type = obj.TYPE_CONST diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules index 774d5096de..de30d003e6 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64.rules +++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules @@ -1025,6 +1025,8 @@ (SLWconst [c] z:(MOVWZreg x)) && z.Uses == 1 && c < 24 => (CLRLSLWI [newPPC64ShiftAuxInt(c,8,31,32)] x) (SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) (SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) +// special case for 
power9 +(SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && objabi.GOPPC64 >= 9 => (EXTSWSLconst [c] x) // Lose widening ops fed to stores (MOVBstore [off] {sym} ptr (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstore [off] {sym} ptr x mem) diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go index ed99c40cd2..28317928a8 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go @@ -223,6 +223,7 @@ func init() { {name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits {name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits + {name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"}, {name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros {name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit) diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 1fc0f7ea79..1fe00c7026 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1865,6 +1865,7 @@ const ( OpPPC64SLWconst OpPPC64ROTLconst OpPPC64ROTLWconst + OpPPC64EXTSWSLconst OpPPC64CNTLZD OpPPC64CNTLZW OpPPC64CNTTZD @@ -24849,6 +24850,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "EXTSWSLconst", + auxType: auxInt64, + argLen: 1, + asm: ppc64.AEXTSWSLI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + outputs: []outputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + }, + }, { name: "CNTLZD", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go index 12b08824b5..29ec3992f2 100644 --- a/src/cmd/compile/internal/ssa/rewritePPC64.go +++ b/src/cmd/compile/internal/ssa/rewritePPC64.go @@ -12877,6 +12877,24 @@ func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { } break } + // match: (SLDconst [c] z:(MOVWreg x)) + // cond: c < 32 && objabi.GOPPC64 >= 9 + // result: (EXTSWSLconst [c] x) + for { + c := auxIntToInt64(v.AuxInt) + z := v_0 + if z.Op != OpPPC64MOVWreg { + break + } + x := z.Args[0] + if !(c < 32 && objabi.GOPPC64 >= 9) { + break + } + v.reset(OpPPC64EXTSWSLconst) + v.AuxInt = int64ToAuxInt(c) + v.AddArg(x) + return true + } return false } func rewriteValuePPC64_OpPPC64SLW(v *Value) bool { @@ -13000,6 +13018,24 @@ func rewriteValuePPC64_OpPPC64SLWconst(v *Value) bool { } break } + // match: (SLWconst [c] z:(MOVWreg x)) + // cond: c < 32 && objabi.GOPPC64 >= 9 + // result: (EXTSWSLconst [c] x) + for { + c := auxIntToInt64(v.AuxInt) + z := v_0 + if z.Op != OpPPC64MOVWreg { + break + } + x := z.Args[0] + if !(c < 32 && objabi.GOPPC64 >= 9) { + break + } + v.reset(OpPPC64EXTSWSLconst) + v.AuxInt = int64ToAuxInt(c) + v.AddArg(x) + return true + } return false } func rewriteValuePPC64_OpPPC64SRAD(v *Value) bool { diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go index f438803fb5..4c97302f83 100644 --- a/src/cmd/internal/obj/ppc64/a.out.go +++ b/src/cmd/internal/obj/ppc64/a.out.go @@ -733,6 +733,8 @@ const ( ASRAD ASRADCC ASRDCC + AEXTSWSLI + AEXTSWSLICC ASTDCCC ATD diff --git a/src/cmd/internal/obj/ppc64/anames.go 
b/src/cmd/internal/obj/ppc64/anames.go index accd87fe00..fca4b3e355 100644 --- a/src/cmd/internal/obj/ppc64/anames.go +++ b/src/cmd/internal/obj/ppc64/anames.go @@ -329,6 +329,8 @@ var Anames = []string{ "SRAD", "SRADCC", "SRDCC", + "EXTSWSLI", + "EXTSWSLICC", "STDCCC", "TD", "DWORD", diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index 60dda72507..9f06bdf8b3 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -160,6 +160,8 @@ var optab = []Optab{ {ASLD, C_REG, C_REG, C_NONE, C_REG, 6, 4, 0}, {ASLD, C_SCON, C_REG, C_NONE, C_REG, 25, 4, 0}, {ASLD, C_SCON, C_NONE, C_NONE, C_REG, 25, 4, 0}, + {AEXTSWSLI, C_SCON, C_NONE, C_NONE, C_REG, 25, 4, 0}, + {AEXTSWSLI, C_SCON, C_REG, C_NONE, C_REG, 25, 4, 0}, {ASLW, C_SCON, C_REG, C_NONE, C_REG, 57, 4, 0}, {ASLW, C_SCON, C_NONE, C_NONE, C_REG, 57, 4, 0}, {ASRAW, C_REG, C_NONE, C_NONE, C_REG, 6, 4, 0}, @@ -1877,6 +1879,9 @@ func buildop(ctxt *obj.Link) { case ASRAW: /* sraw Rb,Rs,Ra; srawi sh,Rs,Ra */ opset(ASRAWCC, r0) + case AEXTSWSLI: + opset(AEXTSWSLICC, r0) + case ASRAD: /* sraw Rb,Rs,Ra; srawi sh,Rs,Ra */ opset(ASRADCC, r0) @@ -2189,49 +2194,54 @@ func AOP_RLDIC(op uint32, a uint32, s uint32, sh uint32, m uint32) uint32 { return op | (s&31)<<21 | (a&31)<<16 | (sh&31)<<11 | ((sh&32)>>5)<<1 | (m&31)<<6 | ((m&32)>>5)<<5 } +func AOP_EXTSWSLI(op uint32, a uint32, s uint32, sh uint32) uint32 { + return op | (a&31)<<21 | (s&31)<<16 | (sh&31)<<11 | ((sh&32)>>5)<<1 +} + func AOP_ISEL(op uint32, t uint32, a uint32, b uint32, bc uint32) uint32 { return op | (t&31)<<21 | (a&31)<<16 | (b&31)<<11 | (bc&0x1F)<<6 } const ( /* each rhs is OPVCC(_, _, _, _) */ - OP_ADD = 31<<26 | 266<<1 | 0<<10 | 0 - OP_ADDI = 14<<26 | 0<<1 | 0<<10 | 0 - OP_ADDIS = 15<<26 | 0<<1 | 0<<10 | 0 - OP_ANDI = 28<<26 | 0<<1 | 0<<10 | 0 - OP_EXTSB = 31<<26 | 954<<1 | 0<<10 | 0 - OP_EXTSH = 31<<26 | 922<<1 | 0<<10 | 0 - OP_EXTSW = 31<<26 | 986<<1 | 0<<10 | 0 - OP_ISEL = 31<<26 | 15<<1 | 0<<10 | 0 - OP_MCRF = 19<<26 | 0<<1 | 0<<10 | 0 - OP_MCRFS = 63<<26 | 64<<1 | 0<<10 | 0 - OP_MCRXR = 31<<26 | 512<<1 | 0<<10 | 0 - OP_MFCR = 31<<26 | 19<<1 | 0<<10 | 0 - OP_MFFS = 63<<26 | 583<<1 | 0<<10 | 0 - OP_MFMSR = 31<<26 | 83<<1 | 0<<10 | 0 - OP_MFSPR = 31<<26 | 339<<1 | 0<<10 | 0 - OP_MFSR = 31<<26 | 595<<1 | 0<<10 | 0 - OP_MFSRIN = 31<<26 | 659<<1 | 0<<10 | 0 - OP_MTCRF = 31<<26 | 144<<1 | 0<<10 | 0 - OP_MTFSF = 63<<26 | 711<<1 | 0<<10 | 0 - OP_MTFSFI = 63<<26 | 134<<1 | 0<<10 | 0 - OP_MTMSR = 31<<26 | 146<<1 | 0<<10 | 0 - OP_MTMSRD = 31<<26 | 178<<1 | 0<<10 | 0 - OP_MTSPR = 31<<26 | 467<<1 | 0<<10 | 0 - OP_MTSR = 31<<26 | 210<<1 | 0<<10 | 0 - OP_MTSRIN = 31<<26 | 242<<1 | 0<<10 | 0 - OP_MULLW = 31<<26 | 235<<1 | 0<<10 | 0 - OP_MULLD = 31<<26 | 233<<1 | 0<<10 | 0 - OP_OR = 31<<26 | 444<<1 | 0<<10 | 0 - OP_ORI = 24<<26 | 0<<1 | 0<<10 | 0 - OP_ORIS = 25<<26 | 0<<1 | 0<<10 | 0 - OP_RLWINM = 21<<26 | 0<<1 | 0<<10 | 0 - OP_RLWNM = 23<<26 | 0<<1 | 0<<10 | 0 - OP_SUBF = 31<<26 | 40<<1 | 0<<10 | 0 - OP_RLDIC = 30<<26 | 4<<1 | 0<<10 | 0 - OP_RLDICR = 30<<26 | 2<<1 | 0<<10 | 0 - OP_RLDICL = 30<<26 | 0<<1 | 0<<10 | 0 - OP_RLDCL = 30<<26 | 8<<1 | 0<<10 | 0 + OP_ADD = 31<<26 | 266<<1 | 0<<10 | 0 + OP_ADDI = 14<<26 | 0<<1 | 0<<10 | 0 + OP_ADDIS = 15<<26 | 0<<1 | 0<<10 | 0 + OP_ANDI = 28<<26 | 0<<1 | 0<<10 | 0 + OP_EXTSB = 31<<26 | 954<<1 | 0<<10 | 0 + OP_EXTSH = 31<<26 | 922<<1 | 0<<10 | 0 + OP_EXTSW = 31<<26 | 986<<1 | 0<<10 | 0 + OP_ISEL = 31<<26 | 15<<1 | 0<<10 | 0 + OP_MCRF = 19<<26 | 0<<1 | 0<<10 | 0 + OP_MCRFS = 63<<26 | 64<<1 | 
0<<10 | 0 + OP_MCRXR = 31<<26 | 512<<1 | 0<<10 | 0 + OP_MFCR = 31<<26 | 19<<1 | 0<<10 | 0 + OP_MFFS = 63<<26 | 583<<1 | 0<<10 | 0 + OP_MFMSR = 31<<26 | 83<<1 | 0<<10 | 0 + OP_MFSPR = 31<<26 | 339<<1 | 0<<10 | 0 + OP_MFSR = 31<<26 | 595<<1 | 0<<10 | 0 + OP_MFSRIN = 31<<26 | 659<<1 | 0<<10 | 0 + OP_MTCRF = 31<<26 | 144<<1 | 0<<10 | 0 + OP_MTFSF = 63<<26 | 711<<1 | 0<<10 | 0 + OP_MTFSFI = 63<<26 | 134<<1 | 0<<10 | 0 + OP_MTMSR = 31<<26 | 146<<1 | 0<<10 | 0 + OP_MTMSRD = 31<<26 | 178<<1 | 0<<10 | 0 + OP_MTSPR = 31<<26 | 467<<1 | 0<<10 | 0 + OP_MTSR = 31<<26 | 210<<1 | 0<<10 | 0 + OP_MTSRIN = 31<<26 | 242<<1 | 0<<10 | 0 + OP_MULLW = 31<<26 | 235<<1 | 0<<10 | 0 + OP_MULLD = 31<<26 | 233<<1 | 0<<10 | 0 + OP_OR = 31<<26 | 444<<1 | 0<<10 | 0 + OP_ORI = 24<<26 | 0<<1 | 0<<10 | 0 + OP_ORIS = 25<<26 | 0<<1 | 0<<10 | 0 + OP_RLWINM = 21<<26 | 0<<1 | 0<<10 | 0 + OP_RLWNM = 23<<26 | 0<<1 | 0<<10 | 0 + OP_SUBF = 31<<26 | 40<<1 | 0<<10 | 0 + OP_RLDIC = 30<<26 | 4<<1 | 0<<10 | 0 + OP_RLDICR = 30<<26 | 2<<1 | 0<<10 | 0 + OP_RLDICL = 30<<26 | 0<<1 | 0<<10 | 0 + OP_RLDCL = 30<<26 | 8<<1 | 0<<10 | 0 + OP_EXTSWSLI = 31<<26 | 445<<2 ) func oclass(a *obj.Addr) int { @@ -2965,14 +2975,21 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { case AROTL: a = int(0) op = OP_RLDICL + case AEXTSWSLI: + a = int(v) default: c.ctxt.Diag("unexpected op in sldi case\n%v", p) a = 0 o1 = 0 } - o1 = AOP_RLDIC(op, uint32(p.To.Reg), uint32(r), uint32(v), uint32(a)) - if p.As == ASLDCC || p.As == ASRDCC { + if p.As == AEXTSWSLI || p.As == AEXTSWSLICC { + o1 = AOP_EXTSWSLI(OP_EXTSWSLI, uint32(r), uint32(p.To.Reg), uint32(v)) + + } else { + o1 = AOP_RLDIC(op, uint32(p.To.Reg), uint32(r), uint32(v), uint32(a)) + } + if p.As == ASLDCC || p.As == ASRDCC || p.As == AEXTSWSLICC { o1 |= 1 // Set the condition code bit } @@ -4350,6 +4367,11 @@ func (c *ctxt9) oprrr(a obj.As) uint32 { case ASRADCC: return OPVCC(31, 794, 0, 1) + case AEXTSWSLI: + return OPVCC(31, 445, 0, 0) + case AEXTSWSLICC: + return OPVCC(31, 445, 0, 1) + case ASRW: return OPVCC(31, 536, 0, 0) case ASRWCC: @@ -5013,6 +5035,10 @@ func (c *ctxt9) opirr(a obj.As) uint32 { return OPVCC(31, (413 << 1), 0, 0) case ASRADCC: return OPVCC(31, (413 << 1), 0, 1) + case AEXTSWSLI: + return OPVCC(31, 445, 0, 0) + case AEXTSWSLICC: + return OPVCC(31, 445, 0, 1) case ASTSW: return OPVCC(31, 725, 0, 0) diff --git a/test/codegen/shift.go b/test/codegen/shift.go index 32214851b5..abc4b091c9 100644 --- a/test/codegen/shift.go +++ b/test/codegen/shift.go @@ -182,7 +182,7 @@ func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byt return f, g } -func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, v64 uint64) (uint8, uint16, uint32, uint64) { +func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64) (uint8, uint16, uint32, uint64, int64) { // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" @@ -202,7 +202,10 @@ func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, v64 uint64) (uint8, u // ppc64le:-"AND","CLRLSLDI" // ppc64:-"AND","CLRLSLDI" i := (v64 & 0xFFFFFFFF) << 5 - return f, g, h, i + // ppc64le/power9:-"SLD","EXTSWSLI" + // ppc64/power9:-"SLD","EXTSWSLI" + j := int64(x32+32)*8 + return f, g, h, i, j } func checkWidenAfterShift(v int64, u uint64) (int64, uint64) { -- cgit v1.3 From cc2a5cf4b8b0aeaccd3dd439f8d3d68f25eef358 Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Mon, 28 Sep 2020 18:20:12 -0400 Subject: cmd/compile,cmd/internal/obj/ppc64: fix some shift rules due to a regression A recent change to improve 
shifts was generating some invalid cases when the rule was based on an AND. The extended mnemonics CLRLSLDI and CLRLSLWI only allow certain values for the operands and in the mask case those values were not being checked properly. This adds a check to those rules to verify that the 'b' and 'n' values used when an AND was part of the rule have correct values. There was a bug in some diag messages in asm9. The message expected 3 values but only provided 2. Those are corrected here also. The test/codegen/shift.go was updated to add a few more cases to check for the case mentioned here. Some of the comments that mention the order of operands in these extended mnemonics were wrong and those have been corrected. Fixes #41683. Change-Id: If5bb860acaa5051b9e0cd80784b2868b85898c31 Reviewed-on: https://go-review.googlesource.com/c/go/+/258138 Run-TryBot: Lynn Boger Reviewed-by: Paul Murphy Reviewed-by: Carlos Eduardo Seo TryBot-Result: Go Bot Trust: Lynn Boger --- src/cmd/asm/internal/asm/testdata/ppc64enc.s | 4 ++-- src/cmd/compile/internal/ppc64/ssa.go | 12 +++++----- src/cmd/compile/internal/ssa/gen/PPC64.rules | 9 ++++--- src/cmd/compile/internal/ssa/rewrite.go | 4 ++-- src/cmd/compile/internal/ssa/rewritePPC64.go | 34 +++++++-------------------- src/cmd/internal/obj/ppc64/asm9.go | 35 ++++++++++++++-------------- test/codegen/shift.go | 17 ++++++++------ 7 files changed, 50 insertions(+), 65 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/asm/internal/asm/testdata/ppc64enc.s b/src/cmd/asm/internal/asm/testdata/ppc64enc.s index 88a7609ba8..869f8c2d4f 100644 --- a/src/cmd/asm/internal/asm/testdata/ppc64enc.s +++ b/src/cmd/asm/internal/asm/testdata/ppc64enc.s @@ -287,8 +287,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 RLDICRCC $0, R4, $15, R6 // 788603c5 RLDIC $0, R4, $15, R6 // 788603c8 RLDICCC $0, R4, $15, R6 // 788603c9 - CLRLSLWI $16, R5, $8, R4 // 54a4861e - CLRLSLDI $2, R4, $24, R3 // 78831588 + CLRLSLWI $8, R5, $6, R4 // 54a430b2 + CLRLSLDI $24, R4, $4, R3 // 78832508 BEQ 0(PC) // 41820000 BGE 0(PC) // 40800000 diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index a5fbdaffba..d83b2df379 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -570,9 +570,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { r1 := v.Args[0].Reg() shifts := v.AuxInt p := s.Prog(v.Op.Asm()) - // clrlslwi ra,rs,sh,mb will become rlwinm ra,rs,sh,mb-sh,31-n as described in ISA - p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} - p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}) + // clrlslwi ra,rs,mb,sh will become rlwinm ra,rs,sh,mb-sh,31-sh as described in ISA + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)}) p.Reg = r1 p.To.Type = obj.TYPE_REG p.To.Reg = r @@ -582,9 +582,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { r1 := v.Args[0].Reg() shifts := v.AuxInt p := s.Prog(v.Op.Asm()) - // clrlsldi ra,rs,sh,mb will become rldic ra,rs,sh,mb-sh - p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} - p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}) + // clrlsldi ra,rs,mb,sh will become rldic ra,rs,sh,mb-sh + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)}) p.Reg = r1 p.To.Type = 
obj.TYPE_REG p.To.Reg = r diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules index de30d003e6..83ee4c499b 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64.rules +++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules @@ -1018,13 +1018,12 @@ (SLDconst [c] z:(MOVHZreg x)) && c < 16 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,48,63,64)] x) (SLDconst [c] z:(MOVWZreg x)) && c < 32 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,32,63,64)] x) -(SLDconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) -(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) +(SLDconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) +(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) (SLWconst [c] z:(MOVBZreg x)) && z.Uses == 1 && c < 8 => (CLRLSLWI [newPPC64ShiftAuxInt(c,24,31,32)] x) (SLWconst [c] z:(MOVHZreg x)) && z.Uses == 1 && c < 16 => (CLRLSLWI [newPPC64ShiftAuxInt(c,16,31,32)] x) -(SLWconst [c] z:(MOVWZreg x)) && z.Uses == 1 && c < 24 => (CLRLSLWI [newPPC64ShiftAuxInt(c,8,31,32)] x) -(SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) -(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) +(SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) +(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) // special case for power9 (SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && objabi.GOPPC64 >= 9 => (EXTSWSLconst [c] x) diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 9f4de83a77..5d8b3ddc4e 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -1380,8 +1380,8 @@ func GetPPC64Shiftme(auxint int64) int64 { return int64(int8(auxint)) } -// Catch the simple ones first -// TODO: Later catch more cases +// This verifies that the mask occupies the +// rightmost bits. 
func isPPC64ValidShiftMask(v int64) bool { if ((v + 1) & v) == 0 { return true diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go index 29ec3992f2..9822637b05 100644 --- a/src/cmd/compile/internal/ssa/rewritePPC64.go +++ b/src/cmd/compile/internal/ssa/rewritePPC64.go @@ -12831,7 +12831,7 @@ func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { return true } // match: (SLDconst [c] z:(ANDconst [d] x)) - // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) + // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) // result: (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) for { c := auxIntToInt64(v.AuxInt) @@ -12841,7 +12841,7 @@ func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { } d := auxIntToInt64(z.AuxInt) x := z.Args[0] - if !(z.Uses == 1 && isPPC64ValidShiftMask(d)) { + if !(z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d))) { break } v.reset(OpPPC64CLRLSLDI) @@ -12850,7 +12850,7 @@ func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { return true } // match: (SLDconst [c] z:(AND (MOVDconst [d]) x)) - // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) + // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) // result: (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) for { c := auxIntToInt64(v.AuxInt) @@ -12867,7 +12867,7 @@ func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { } d := auxIntToInt64(z_0.AuxInt) x := z_1 - if !(z.Uses == 1 && isPPC64ValidShiftMask(d)) { + if !(z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d))) { continue } v.reset(OpPPC64CLRLSLDI) @@ -12953,26 +12953,8 @@ func rewriteValuePPC64_OpPPC64SLWconst(v *Value) bool { v.AddArg(x) return true } - // match: (SLWconst [c] z:(MOVWZreg x)) - // cond: z.Uses == 1 && c < 24 - // result: (CLRLSLWI [newPPC64ShiftAuxInt(c,8,31,32)] x) - for { - c := auxIntToInt64(v.AuxInt) - z := v_0 - if z.Op != OpPPC64MOVWZreg { - break - } - x := z.Args[0] - if !(z.Uses == 1 && c < 24) { - break - } - v.reset(OpPPC64CLRLSLWI) - v.AuxInt = int32ToAuxInt(newPPC64ShiftAuxInt(c, 8, 31, 32)) - v.AddArg(x) - return true - } // match: (SLWconst [c] z:(ANDconst [d] x)) - // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) + // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) // result: (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) for { c := auxIntToInt64(v.AuxInt) @@ -12982,7 +12964,7 @@ func rewriteValuePPC64_OpPPC64SLWconst(v *Value) bool { } d := auxIntToInt64(z.AuxInt) x := z.Args[0] - if !(z.Uses == 1 && isPPC64ValidShiftMask(d)) { + if !(z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (32-getPPC64ShiftMaskLength(d))) { break } v.reset(OpPPC64CLRLSLWI) @@ -12991,7 +12973,7 @@ func rewriteValuePPC64_OpPPC64SLWconst(v *Value) bool { return true } // match: (SLWconst [c] z:(AND (MOVDconst [d]) x)) - // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) + // cond: z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) // result: (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) for { c := auxIntToInt64(v.AuxInt) @@ -13008,7 +12990,7 @@ func rewriteValuePPC64_OpPPC64SLWconst(v *Value) bool { } d := auxIntToInt64(z_0.AuxInt) x := z_1 - if !(z.Uses == 1 && isPPC64ValidShiftMask(d)) { + if !(z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (32-getPPC64ShiftMaskLength(d))) { continue } v.reset(OpPPC64CLRLSLWI) diff 
--git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index 9f06bdf8b3..928e299f43 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -2749,7 +2749,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { me := int(d) sh := c.regoff(&p.From) if me < 0 || me > 63 || sh > 63 { - c.ctxt.Diag("Invalid me or sh for RLDICR: %x %x\n%v", int(d), sh) + c.ctxt.Diag("Invalid me or sh for RLDICR: %x %x\n%v", int(d), sh, p) } o1 = AOP_RLDIC(c.oprrr(p.As), uint32(p.To.Reg), uint32(r), uint32(sh), uint32(me)) @@ -2757,19 +2757,19 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { mb := int(d) sh := c.regoff(&p.From) if mb < 0 || mb > 63 || sh > 63 { - c.ctxt.Diag("Invalid mb or sh for RLDIC, RLDICL: %x %x\n%v", mb, sh) + c.ctxt.Diag("Invalid mb or sh for RLDIC, RLDICL: %x %x\n%v", mb, sh, p) } o1 = AOP_RLDIC(c.oprrr(p.As), uint32(p.To.Reg), uint32(r), uint32(sh), uint32(mb)) case ACLRLSLDI: // This is an extended mnemonic defined in the ISA section C.8.1 - // clrlsldi ra,rs,n,b --> rldic ra,rs,n,b-n + // clrlsldi ra,rs,b,n --> rldic ra,rs,n,b-n // It maps onto RLDIC so is directly generated here based on the operands from // the clrlsldi. - b := int(d) - n := c.regoff(&p.From) - if n > int32(b) || b > 63 { - c.ctxt.Diag("Invalid n or b for CLRLSLDI: %x %x\n%v", n, b) + n := int32(d) + b := c.regoff(&p.From) + if n > b || b > 63 { + c.ctxt.Diag("Invalid n or b for CLRLSLDI: %x %x\n%v", n, b, p) } o1 = AOP_RLDIC(OP_RLDIC, uint32(p.To.Reg), uint32(r), uint32(n), uint32(b)-uint32(n)) @@ -3395,14 +3395,15 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { v := c.regoff(&p.From) switch p.As { case ACLRLSLWI: - b := c.regoff(p.GetFrom3()) + n := c.regoff(p.GetFrom3()) // This is an extended mnemonic described in the ISA C.8.2 - // clrlslwi ra,rs,n,b -> rlwinm ra,rs,n,b-n,31-n + // clrlslwi ra,rs,b,n -> rlwinm ra,rs,n,b-n,31-n // It maps onto rlwinm which is directly generated here. - if v < 0 || v > 32 || b > 32 { - c.ctxt.Diag("Invalid n or b for CLRLSLWI: %x %x\n%v", v, b) + if n > v || v >= 32 { + c.ctxt.Diag("Invalid n or b for CLRLSLWI: %x %x\n%v", v, n, p) } - o1 = OP_RLW(OP_RLWINM, uint32(p.To.Reg), uint32(p.Reg), uint32(v), uint32(b-v), uint32(31-v)) + + o1 = OP_RLW(OP_RLWINM, uint32(p.To.Reg), uint32(p.Reg), uint32(n), uint32(v-n), uint32(31-n)) default: var mask [2]uint8 c.maskgen(p, mask[:], uint32(c.regoff(p.GetFrom3()))) @@ -3414,16 +3415,16 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { v := c.regoff(&p.From) switch p.As { case ACLRLSLWI: - b := c.regoff(p.GetFrom3()) - if v > b || b > 32 { + n := c.regoff(p.GetFrom3()) + if n > v || v >= 32 { // Message will match operands from the ISA even though in the // code it uses 'v' - c.ctxt.Diag("Invalid n or b for CLRLSLWI: %x %x\n%v", v, b) + c.ctxt.Diag("Invalid n or b for CLRLSLWI: %x %x\n%v", v, n, p) } // This is an extended mnemonic described in the ISA C.8.2 - // clrlslwi ra,rs,n,b -> rlwinm ra,rs,n,b-n,31-n + // clrlslwi ra,rs,b,n -> rlwinm ra,rs,n,b-n,31-n // It generates the rlwinm directly here. 
- o1 = OP_RLW(OP_RLWINM, uint32(p.To.Reg), uint32(p.Reg), uint32(v), uint32(b-v), uint32(31-v)) + o1 = OP_RLW(OP_RLWINM, uint32(p.To.Reg), uint32(p.Reg), uint32(n), uint32(v-n), uint32(31-n)) default: var mask [2]uint8 c.maskgen(p, mask[:], uint32(c.regoff(p.GetFrom3()))) diff --git a/test/codegen/shift.go b/test/codegen/shift.go index abc4b091c9..bbfc85ffbb 100644 --- a/test/codegen/shift.go +++ b/test/codegen/shift.go @@ -187,8 +187,8 @@ func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64 // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" f := (v8 &0xF) << 2 - // ppc64le:-"AND","CLRLSLWI" - // ppc64:-"AND","CLRLSLWI" + // ppc64le:"CLRLSLWI" + // ppc64:"CLRLSLWI" f += byte(v16)<<3 // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" @@ -196,12 +196,15 @@ func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64 // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" h := (v32 & 0xFFFFF) << 2 - // ppc64le:-"AND","CLRLSLWI" - // ppc64:-"AND","CLRLSLWI" - h += uint32(v64)<<4 - // ppc64le:-"AND","CLRLSLDI" - // ppc64:-"AND","CLRLSLDI" + // ppc64le:"CLRLSLDI" + // ppc64:"CLRLSLDI" i := (v64 & 0xFFFFFFFF) << 5 + // ppc64le:-"CLRLSLDI" + // ppc64:-"CLRLSLDI" + i += (v64 & 0xFFFFFFF) << 38 + // ppc64le/power9:-"CLRLSLDI" + // ppc64/power9:-"CLRLSLDI" + i += (v64 & 0xFFFF00) << 10 // ppc64le/power9:-"SLD","EXTSWSLI" // ppc64/power9:-"SLD","EXTSWSLI" j := int64(x32+32)*8 -- cgit v1.3 From fe2cfb74ba6352990f5b41260b99e80f78e4a90a Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 1 Oct 2020 14:49:33 -0700 Subject: all: drop 387 support My last 387 CL. So sad ... ... ... ... not! Fixes #40255 Change-Id: I8d4ddb744b234b8adc735db2f7c3c7b6d8bbdfa4 Reviewed-on: https://go-review.googlesource.com/c/go/+/258957 Trust: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/cmd/asm/internal/asm/endtoend_test.go | 7 +- src/cmd/compile/internal/gc/float_test.go | 19 -- src/cmd/compile/internal/gc/go.go | 5 - src/cmd/compile/internal/gc/ssa.go | 15 +- src/cmd/compile/internal/ssa/config.go | 6 - src/cmd/compile/internal/ssa/gen/386.rules | 10 +- src/cmd/compile/internal/ssa/gen/386Ops.go | 14 - src/cmd/compile/internal/ssa/opGen.go | 13 - src/cmd/compile/internal/ssa/regalloc.go | 14 - src/cmd/compile/internal/ssa/rewrite386.go | 84 ++---- src/cmd/compile/internal/x86/387.go | 403 ----------------------------- src/cmd/compile/internal/x86/galign.go | 17 +- src/cmd/compile/internal/x86/ssa.go | 2 - src/cmd/dist/build.go | 15 -- src/cmd/dist/buildruntime.go | 2 - src/cmd/dist/cpuid_386.s | 16 -- src/cmd/dist/cpuid_amd64.s | 16 -- src/cmd/dist/cpuid_default.s | 10 - src/cmd/dist/util_gc.go | 12 - src/cmd/dist/util_gccgo.go | 13 - src/cmd/go/alldocs.go | 3 - src/cmd/go/internal/cfg/cfg.go | 3 - src/cmd/go/internal/envcmd/env.go | 5 +- src/cmd/go/internal/help/helpdoc.go | 3 - src/cmd/go/internal/work/exec.go | 4 +- src/cmd/internal/objabi/util.go | 9 +- src/internal/cfg/cfg.go | 1 - src/reflect/all_test.go | 18 -- src/runtime/mkpreempt.go | 33 +-- src/runtime/preempt_386.s | 45 ++-- src/runtime/vlrt.go | 5 +- test/codegen/arithmetic.go | 6 +- test/codegen/floats.go | 19 +- test/codegen/math.go | 2 +- test/codegen/memops.go | 32 +-- test/run.go | 12 +- 36 files changed, 97 insertions(+), 796 deletions(-) delete mode 100644 src/cmd/compile/internal/x86/387.go delete mode 100644 src/cmd/dist/cpuid_386.s delete mode 100644 src/cmd/dist/cpuid_amd64.s delete mode 100644 src/cmd/dist/cpuid_default.s (limited to 
'test/codegen') diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go index 0759b7d10f..15202dc5dc 100644 --- a/src/cmd/asm/internal/asm/endtoend_test.go +++ b/src/cmd/asm/internal/asm/endtoend_test.go @@ -353,12 +353,7 @@ func testErrors(t *testing.T, goarch, file string) { } func Test386EndToEnd(t *testing.T) { - defer func(old string) { objabi.GO386 = old }(objabi.GO386) - for _, go386 := range []string{"387", "sse2"} { - t.Logf("GO386=%v", go386) - objabi.GO386 = go386 - testEndToEnd(t, "386", "386") - } + testEndToEnd(t, "386", "386") } func TestARMEndToEnd(t *testing.T) { diff --git a/src/cmd/compile/internal/gc/float_test.go b/src/cmd/compile/internal/gc/float_test.go index 6ae363be22..c619d25705 100644 --- a/src/cmd/compile/internal/gc/float_test.go +++ b/src/cmd/compile/internal/gc/float_test.go @@ -6,17 +6,9 @@ package gc import ( "math" - "os" - "runtime" "testing" ) -// For GO386=387, make sure fucomi* opcodes are not used -// for comparison operations. -// Note that this test will fail only on a Pentium MMX -// processor (with GOARCH=386 GO386=387), as it just runs -// some code and looks for an unimplemented instruction fault. - //go:noinline func compare1(a, b float64) bool { return a < b @@ -137,9 +129,6 @@ func TestFloatCompareFolded(t *testing.T) { } } -// For GO386=387, make sure fucomi* opcodes are not used -// for float->int conversions. - //go:noinline func cvt1(a float64) uint64 { return uint64(a) @@ -370,14 +359,6 @@ func TestFloat32StoreToLoadConstantFold(t *testing.T) { // are not converted to quiet NaN (qNaN) values during compilation. // See issue #27193 for more information. - // TODO: this method for detecting 387 won't work if the compiler has been - // built using GOARCH=386 GO386=387 and either the target is a different - // architecture or the GO386=387 environment variable is not set when the - // test is run. - if runtime.GOARCH == "386" && os.Getenv("GO386") == "387" { - t.Skip("signaling NaNs are not propagated on 387 (issue #27516)") - } - // signaling NaNs { const nan = uint32(0x7f800001) // sNaN diff --git a/src/cmd/compile/internal/gc/go.go b/src/cmd/compile/internal/gc/go.go index 9079ce2afc..2fbdf71055 100644 --- a/src/cmd/compile/internal/gc/go.go +++ b/src/cmd/compile/internal/gc/go.go @@ -259,7 +259,6 @@ type Arch struct { REGSP int MAXWIDTH int64 - Use387 bool // should 386 backend use 387 FP instructions instead of sse2. 
SoftFloat bool PadFrame func(int64) int64 @@ -328,10 +327,6 @@ var ( BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym ExtendCheckFunc [ssa.BoundsKindCount]*obj.LSym - // GO386=387 - ControlWord64trunc, - ControlWord32 *obj.LSym - // Wasm WasmMove, WasmZero, diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 1d50cefe54..32394c4b1a 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -62,9 +62,6 @@ func initssaconfig() { _ = types.NewPtr(types.Errortype) // *error types.NewPtrCacheEnabled = false ssaConfig = ssa.NewConfig(thearch.LinkArch.Name, *types_, Ctxt, Debug['N'] == 0) - if thearch.LinkArch.Name == "386" { - ssaConfig.Set387(thearch.Use387) - } ssaConfig.SoftFloat = thearch.SoftFloat ssaConfig.Race = flag_race ssaCaches = make([]ssa.Cache, nBackendWorkers) @@ -175,10 +172,6 @@ func initssaconfig() { ExtendCheckFunc[ssa.BoundsSlice3CU] = sysvar("panicExtendSlice3CU") } - // GO386=387 runtime definitions - ControlWord64trunc = sysvar("controlWord64trunc") // uint16 - ControlWord32 = sysvar("controlWord32") // uint16 - // Wasm (all asm funcs with special ABIs) WasmMove = sysvar("wasmMove") WasmZero = sysvar("wasmZero") @@ -5946,9 +5939,7 @@ type SSAGenState struct { // bstart remembers where each block starts (indexed by block ID) bstart []*obj.Prog - // 387 port: maps from SSE registers (REG_X?) to 387 registers (REG_F?) - SSEto387 map[int16]int16 - // Some architectures require a 64-bit temporary for FP-related register shuffling. Examples include x86-387, PPC, and Sparc V8. + // Some architectures require a 64-bit temporary for FP-related register shuffling. Examples include PPC and Sparc V8. ScratchFpMem *Node maxarg int64 // largest frame size for arguments to calls made by the function @@ -6115,10 +6106,6 @@ func genssa(f *ssa.Func, pp *Progs) { progToBlock[s.pp.next] = f.Blocks[0] } - if thearch.Use387 { - s.SSEto387 = map[int16]int16{} - } - s.ScratchFpMem = e.scratchFpMem if Ctxt.Flag_locationlists { diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go index 88a406deb9..649b5ba820 100644 --- a/src/cmd/compile/internal/ssa/config.go +++ b/src/cmd/compile/internal/ssa/config.go @@ -38,7 +38,6 @@ type Config struct { useSSE bool // Use SSE for non-float operations useAvg bool // Use optimizations that need Avg* operations useHmul bool // Use optimizations that need Hmul* operations - use387 bool // GO386=387 SoftFloat bool // Race bool // race detector enabled NeedsFpScratch bool // No direct move between GP and FP register sets @@ -387,9 +386,4 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize bool) *Config return c } -func (c *Config) Set387(b bool) { - c.NeedsFpScratch = b - c.use387 = b -} - func (c *Config) Ctxt() *obj.Link { return c.ctxt } diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules index 4a8244eb27..6a0b87cab4 100644 --- a/src/cmd/compile/internal/ssa/gen/386.rules +++ b/src/cmd/compile/internal/ssa/gen/386.rules @@ -38,10 +38,8 @@ (Xor(32|16|8) ...) => (XORL ...) (Neg(32|16|8) ...) => (NEGL ...) -(Neg32F x) && !config.use387 => (PXOR x (MOVSSconst [float32(math.Copysign(0, -1))])) -(Neg64F x) && !config.use387 => (PXOR x (MOVSDconst [math.Copysign(0, -1)])) -(Neg32F x) && config.use387 => (FCHS x) -(Neg64F x) && config.use387 => (FCHS x) +(Neg32F x) => (PXOR x (MOVSSconst [float32(math.Copysign(0, -1))])) +(Neg64F x) => (PXOR x (MOVSDconst [math.Copysign(0, -1)])) (Com(32|16|8) ...) 
=> (NOTL ...) @@ -670,8 +668,8 @@ // Merge load/store to op ((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem) -((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem) -((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem) (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) (MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go index ddabde7d3d..737b99c371 100644 --- a/src/cmd/compile/internal/ssa/gen/386Ops.go +++ b/src/cmd/compile/internal/ssa/gen/386Ops.go @@ -51,17 +51,6 @@ var regNames386 = []string{ "SB", } -// Notes on 387 support. -// - The 387 has a weird stack-register setup for floating-point registers. -// We use these registers when SSE registers are not available (when GO386=387). -// - We use the same register names (X0-X7) but they refer to the 387 -// floating-point registers. That way, most of the SSA backend is unchanged. -// - The instruction generation pass maintains an SSE->387 register mapping. -// This mapping is updated whenever the FP stack is pushed or popped so that -// we can always find a given SSE register even when the TOS pointer has changed. -// - To facilitate the mapping from SSE to 387, we enforce that -// every basic block starts and ends with an empty floating-point stack. - func init() { // Make map from reg names to reg integers. if len(regNames386) > 64 { @@ -552,9 +541,6 @@ func init() { {name: "FlagGT_UGT"}, // signed > and unsigned < {name: "FlagGT_ULT"}, // signed > and unsigned > - // Special op for -x on 387 - {name: "FCHS", argLength: 1, reg: fp11}, - // Special ops for PIC floating-point constants. // MOVSXconst1 loads the address of the constant-pool entry into a register. // MOVSXconst2 loads the constant from that address. 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 9fe943c2e0..d7d2b24a48 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -536,7 +536,6 @@ const ( Op386FlagLT_UGT Op386FlagGT_UGT Op386FlagGT_ULT - Op386FCHS Op386MOVSSconst1 Op386MOVSDconst1 Op386MOVSSconst2 @@ -6060,18 +6059,6 @@ var opcodeTable = [...]opInfo{ argLen: 0, reg: regInfo{}, }, - { - name: "FCHS", - argLen: 1, - reg: regInfo{ - inputs: []inputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - outputs: []outputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - }, - }, { name: "MOVSSconst1", auxType: auxFloat32, diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 64c6aed3e7..691530ec0b 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -625,9 +625,6 @@ func (s *regAllocState) init(f *Func) { s.f.fe.Fatalf(src.NoXPos, "arch %s not implemented", s.f.Config.arch) } } - if s.f.Config.use387 { - s.allocatable &^= 1 << 15 // X7 disallowed (one 387 register is used as scratch space during SSE->387 generation in ../x86/387.go) - } // Linear scan register allocation can be influenced by the order in which blocks appear. // Decouple the register allocation order from the generated block order. @@ -1024,9 +1021,6 @@ func (s *regAllocState) regalloc(f *Func) { if phiRegs[i] != noRegister { continue } - if s.f.Config.use387 && v.Type.IsFloat() { - continue // 387 can't handle floats in registers between blocks - } m := s.compatRegs(v.Type) &^ phiUsed &^ s.used if m != 0 { r := pickReg(m) @@ -1528,11 +1522,6 @@ func (s *regAllocState) regalloc(f *Func) { s.freeUseRecords = u } - // Spill any values that can't live across basic block boundaries. - if s.f.Config.use387 { - s.freeRegs(s.f.Config.fpRegMask) - } - // If we are approaching a merge point and we are the primary // predecessor of it, find live values that we use soon after // the merge point and promote them to registers now. 
@@ -1562,9 +1551,6 @@ func (s *regAllocState) regalloc(f *Func) { continue } v := s.orig[vid] - if s.f.Config.use387 && v.Type.IsFloat() { - continue // 387 can't handle floats in registers between blocks - } m := s.compatRegs(v.Type) &^ s.used if m&^desired.avoid != 0 { m &^= desired.avoid diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index fc1e0541b2..0f08160f44 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -1310,10 +1310,8 @@ func rewriteValue386_Op386ADDLmodify(v *Value) bool { func rewriteValue386_Op386ADDSD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (ADDSD x l:(MOVSDload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (ADDSDload x [off] {sym} ptr mem) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { @@ -1326,7 +1324,7 @@ func rewriteValue386_Op386ADDSD(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { continue } v.reset(Op386ADDSDload) @@ -1395,10 +1393,8 @@ func rewriteValue386_Op386ADDSDload(v *Value) bool { func rewriteValue386_Op386ADDSS(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (ADDSS x l:(MOVSSload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (ADDSSload x [off] {sym} ptr mem) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { @@ -1411,7 +1407,7 @@ func rewriteValue386_Op386ADDSS(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { continue } v.reset(Op386ADDSSload) @@ -2640,10 +2636,8 @@ func rewriteValue386_Op386CMPWload(v *Value) bool { func rewriteValue386_Op386DIVSD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (DIVSD x l:(MOVSDload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (DIVSDload x [off] {sym} ptr mem) for { x := v_0 @@ -2655,7 +2649,7 @@ func rewriteValue386_Op386DIVSD(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { break } v.reset(Op386DIVSDload) @@ -2722,10 +2716,8 @@ func rewriteValue386_Op386DIVSDload(v *Value) bool { func rewriteValue386_Op386DIVSS(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (DIVSS x l:(MOVSSload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (DIVSSload x [off] {sym} ptr mem) for { x := v_0 @@ -2737,7 +2729,7 @@ func rewriteValue386_Op386DIVSS(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { break } v.reset(Op386DIVSSload) @@ -6104,10 +6096,8 @@ func 
rewriteValue386_Op386MULLload(v *Value) bool { func rewriteValue386_Op386MULSD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (MULSD x l:(MOVSDload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (MULSDload x [off] {sym} ptr mem) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { @@ -6120,7 +6110,7 @@ func rewriteValue386_Op386MULSD(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { continue } v.reset(Op386MULSDload) @@ -6189,10 +6179,8 @@ func rewriteValue386_Op386MULSDload(v *Value) bool { func rewriteValue386_Op386MULSS(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (MULSS x l:(MOVSSload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (MULSSload x [off] {sym} ptr mem) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { @@ -6205,7 +6193,7 @@ func rewriteValue386_Op386MULSS(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { continue } v.reset(Op386MULSSload) @@ -8187,10 +8175,8 @@ func rewriteValue386_Op386SUBLmodify(v *Value) bool { func rewriteValue386_Op386SUBSD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (SUBSD x l:(MOVSDload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (SUBSDload x [off] {sym} ptr mem) for { x := v_0 @@ -8202,7 +8188,7 @@ func rewriteValue386_Op386SUBSD(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { break } v.reset(Op386SUBSDload) @@ -8269,10 +8255,8 @@ func rewriteValue386_Op386SUBSDload(v *Value) bool { func rewriteValue386_Op386SUBSS(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - b := v.Block - config := b.Func.Config // match: (SUBSS x l:(MOVSSload [off] {sym} ptr mem)) - // cond: canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) + // cond: canMergeLoadClobber(v, l, x) && clobber(l) // result: (SUBSSload x [off] {sym} ptr mem) for { x := v_0 @@ -8284,7 +8268,7 @@ func rewriteValue386_Op386SUBSS(v *Value) bool { sym := auxToSym(l.Aux) mem := l.Args[1] ptr := l.Args[0] - if !(canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l)) { + if !(canMergeLoadClobber(v, l, x) && clobber(l)) { break } v.reset(Op386SUBSSload) @@ -10043,68 +10027,32 @@ func rewriteValue386_OpMove(v *Value) bool { func rewriteValue386_OpNeg32F(v *Value) bool { v_0 := v.Args[0] b := v.Block - config := b.Func.Config typ := &b.Func.Config.Types // match: (Neg32F x) - // cond: !config.use387 // result: (PXOR x (MOVSSconst [float32(math.Copysign(0, -1))])) for { x := v_0 - if !(!config.use387) { - break - } v.reset(Op386PXOR) v0 := b.NewValue0(v.Pos, Op386MOVSSconst, typ.Float32) v0.AuxInt = float32ToAuxInt(float32(math.Copysign(0, -1))) v.AddArg2(x, v0) return true } - // match: (Neg32F x) - // cond: config.use387 - // 
result: (FCHS x) - for { - x := v_0 - if !(config.use387) { - break - } - v.reset(Op386FCHS) - v.AddArg(x) - return true - } - return false } func rewriteValue386_OpNeg64F(v *Value) bool { v_0 := v.Args[0] b := v.Block - config := b.Func.Config typ := &b.Func.Config.Types // match: (Neg64F x) - // cond: !config.use387 // result: (PXOR x (MOVSDconst [math.Copysign(0, -1)])) for { x := v_0 - if !(!config.use387) { - break - } v.reset(Op386PXOR) v0 := b.NewValue0(v.Pos, Op386MOVSDconst, typ.Float64) v0.AuxInt = float64ToAuxInt(math.Copysign(0, -1)) v.AddArg2(x, v0) return true } - // match: (Neg64F x) - // cond: config.use387 - // result: (FCHS x) - for { - x := v_0 - if !(config.use387) { - break - } - v.reset(Op386FCHS) - v.AddArg(x) - return true - } - return false } func rewriteValue386_OpNeq16(v *Value) bool { v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/x86/387.go b/src/cmd/compile/internal/x86/387.go deleted file mode 100644 index 594adb2cd5..0000000000 --- a/src/cmd/compile/internal/x86/387.go +++ /dev/null @@ -1,403 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package x86 - -import ( - "cmd/compile/internal/gc" - "cmd/compile/internal/ssa" - "cmd/compile/internal/types" - "cmd/internal/obj" - "cmd/internal/obj/x86" - "math" -) - -// Generates code for v using 387 instructions. -func ssaGenValue387(s *gc.SSAGenState, v *ssa.Value) { - // The SSA compiler pretends that it has an SSE backend. - // If we don't have one of those, we need to translate - // all the SSE ops to equivalent 387 ops. That's what this - // function does. - - switch v.Op { - case ssa.Op386MOVSSconst, ssa.Op386MOVSDconst: - iv := uint64(v.AuxInt) - if iv == 0x0000000000000000 { // +0.0 - s.Prog(x86.AFLDZ) - } else if iv == 0x3ff0000000000000 { // +1.0 - s.Prog(x86.AFLD1) - } else if iv == 0x8000000000000000 { // -0.0 - s.Prog(x86.AFLDZ) - s.Prog(x86.AFCHS) - } else if iv == 0xbff0000000000000 { // -1.0 - s.Prog(x86.AFLD1) - s.Prog(x86.AFCHS) - } else if iv == 0x400921fb54442d18 { // +pi - s.Prog(x86.AFLDPI) - } else if iv == 0xc00921fb54442d18 { // -pi - s.Prog(x86.AFLDPI) - s.Prog(x86.AFCHS) - } else { // others - p := s.Prog(loadPush(v.Type)) - p.From.Type = obj.TYPE_FCONST - p.From.Val = math.Float64frombits(iv) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - } - popAndSave(s, v) - - case ssa.Op386MOVSSconst2, ssa.Op386MOVSDconst2: - p := s.Prog(loadPush(v.Type)) - p.From.Type = obj.TYPE_MEM - p.From.Reg = v.Args[0].Reg() - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - popAndSave(s, v) - - case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVSSloadidx1, ssa.Op386MOVSDloadidx1, ssa.Op386MOVSSloadidx4, ssa.Op386MOVSDloadidx8: - p := s.Prog(loadPush(v.Type)) - p.From.Type = obj.TYPE_MEM - p.From.Reg = v.Args[0].Reg() - gc.AddAux(&p.From, v) - switch v.Op { - case ssa.Op386MOVSSloadidx1, ssa.Op386MOVSDloadidx1: - p.From.Scale = 1 - p.From.Index = v.Args[1].Reg() - if p.From.Index == x86.REG_SP { - p.From.Reg, p.From.Index = p.From.Index, p.From.Reg - } - case ssa.Op386MOVSSloadidx4: - p.From.Scale = 4 - p.From.Index = v.Args[1].Reg() - case ssa.Op386MOVSDloadidx8: - p.From.Scale = 8 - p.From.Index = v.Args[1].Reg() - } - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - popAndSave(s, v) - - case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore: - // Push to-be-stored value on top of stack. - push(s, v.Args[1]) - - // Pop and store value. 
- var op obj.As - switch v.Op { - case ssa.Op386MOVSSstore: - op = x86.AFMOVFP - case ssa.Op386MOVSDstore: - op = x86.AFMOVDP - } - p := s.Prog(op) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_MEM - p.To.Reg = v.Args[0].Reg() - gc.AddAux(&p.To, v) - - case ssa.Op386MOVSSstoreidx1, ssa.Op386MOVSDstoreidx1, ssa.Op386MOVSSstoreidx4, ssa.Op386MOVSDstoreidx8: - push(s, v.Args[2]) - var op obj.As - switch v.Op { - case ssa.Op386MOVSSstoreidx1, ssa.Op386MOVSSstoreidx4: - op = x86.AFMOVFP - case ssa.Op386MOVSDstoreidx1, ssa.Op386MOVSDstoreidx8: - op = x86.AFMOVDP - } - p := s.Prog(op) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_MEM - p.To.Reg = v.Args[0].Reg() - gc.AddAux(&p.To, v) - switch v.Op { - case ssa.Op386MOVSSstoreidx1, ssa.Op386MOVSDstoreidx1: - p.To.Scale = 1 - p.To.Index = v.Args[1].Reg() - if p.To.Index == x86.REG_SP { - p.To.Reg, p.To.Index = p.To.Index, p.To.Reg - } - case ssa.Op386MOVSSstoreidx4: - p.To.Scale = 4 - p.To.Index = v.Args[1].Reg() - case ssa.Op386MOVSDstoreidx8: - p.To.Scale = 8 - p.To.Index = v.Args[1].Reg() - } - - case ssa.Op386ADDSS, ssa.Op386ADDSD, ssa.Op386SUBSS, ssa.Op386SUBSD, - ssa.Op386MULSS, ssa.Op386MULSD, ssa.Op386DIVSS, ssa.Op386DIVSD: - if v.Reg() != v.Args[0].Reg() { - v.Fatalf("input[0] and output not in same register %s", v.LongString()) - } - - // Push arg1 on top of stack - push(s, v.Args[1]) - - // Set precision if needed. 64 bits is the default. - switch v.Op { - case ssa.Op386ADDSS, ssa.Op386SUBSS, ssa.Op386MULSS, ssa.Op386DIVSS: - // Save AX so we can use it as scratch space. - p := s.Prog(x86.AMOVL) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_AX - s.AddrScratch(&p.To) - // Install a 32-bit version of the control word. - installControlWord(s, gc.ControlWord32, x86.REG_AX) - // Restore AX. - p = s.Prog(x86.AMOVL) - s.AddrScratch(&p.From) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_AX - } - - var op obj.As - switch v.Op { - case ssa.Op386ADDSS, ssa.Op386ADDSD: - op = x86.AFADDDP - case ssa.Op386SUBSS, ssa.Op386SUBSD: - op = x86.AFSUBDP - case ssa.Op386MULSS, ssa.Op386MULSD: - op = x86.AFMULDP - case ssa.Op386DIVSS, ssa.Op386DIVSD: - op = x86.AFDIVDP - } - p := s.Prog(op) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_REG - p.To.Reg = s.SSEto387[v.Reg()] + 1 - - // Restore precision if needed. - switch v.Op { - case ssa.Op386ADDSS, ssa.Op386SUBSS, ssa.Op386MULSS, ssa.Op386DIVSS: - restoreControlWord(s) - } - - case ssa.Op386UCOMISS, ssa.Op386UCOMISD: - push(s, v.Args[0]) - - // Compare. - p := s.Prog(x86.AFUCOMP) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_REG - p.To.Reg = s.SSEto387[v.Args[1].Reg()] + 1 - - // Save AX. - p = s.Prog(x86.AMOVL) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_AX - s.AddrScratch(&p.To) - - // Move status word into AX. - p = s.Prog(x86.AFSTSW) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_AX - - // Then move the flags we need to the integer flags. - s.Prog(x86.ASAHF) - - // Restore AX. 
- p = s.Prog(x86.AMOVL) - s.AddrScratch(&p.From) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_AX - - case ssa.Op386SQRTSD: - push(s, v.Args[0]) - s.Prog(x86.AFSQRT) - popAndSave(s, v) - - case ssa.Op386FCHS: - push(s, v.Args[0]) - s.Prog(x86.AFCHS) - popAndSave(s, v) - - case ssa.Op386CVTSL2SS, ssa.Op386CVTSL2SD: - p := s.Prog(x86.AMOVL) - p.From.Type = obj.TYPE_REG - p.From.Reg = v.Args[0].Reg() - s.AddrScratch(&p.To) - p = s.Prog(x86.AFMOVL) - s.AddrScratch(&p.From) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - popAndSave(s, v) - - case ssa.Op386CVTTSD2SL, ssa.Op386CVTTSS2SL: - push(s, v.Args[0]) - - // Load control word which truncates (rounds towards zero). - installControlWord(s, gc.ControlWord64trunc, v.Reg()) - - // Now do the conversion. - p := s.Prog(x86.AFMOVLP) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - s.AddrScratch(&p.To) - p = s.Prog(x86.AMOVL) - s.AddrScratch(&p.From) - p.To.Type = obj.TYPE_REG - p.To.Reg = v.Reg() - - // Restore control word. - restoreControlWord(s) - - case ssa.Op386CVTSS2SD: - // float32 -> float64 is a nop - push(s, v.Args[0]) - popAndSave(s, v) - - case ssa.Op386CVTSD2SS: - // Round to nearest float32. - push(s, v.Args[0]) - p := s.Prog(x86.AFMOVFP) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - s.AddrScratch(&p.To) - p = s.Prog(x86.AFMOVF) - s.AddrScratch(&p.From) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - popAndSave(s, v) - - case ssa.OpLoadReg: - if !v.Type.IsFloat() { - ssaGenValue(s, v) - return - } - // Load+push the value we need. - p := s.Prog(loadPush(v.Type)) - gc.AddrAuto(&p.From, v.Args[0]) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - // Move the value to its assigned register. - popAndSave(s, v) - - case ssa.OpStoreReg: - if !v.Type.IsFloat() { - ssaGenValue(s, v) - return - } - push(s, v.Args[0]) - var op obj.As - switch v.Type.Size() { - case 4: - op = x86.AFMOVFP - case 8: - op = x86.AFMOVDP - } - p := s.Prog(op) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - gc.AddrAuto(&p.To, v) - - case ssa.OpCopy: - if !v.Type.IsFloat() { - ssaGenValue(s, v) - return - } - push(s, v.Args[0]) - popAndSave(s, v) - - case ssa.Op386CALLstatic, ssa.Op386CALLclosure, ssa.Op386CALLinter: - flush387(s) // Calls must empty the FP stack. - fallthrough // then issue the call as normal - default: - ssaGenValue(s, v) - } -} - -// push pushes v onto the floating-point stack. v must be in a register. -func push(s *gc.SSAGenState, v *ssa.Value) { - p := s.Prog(x86.AFMOVD) - p.From.Type = obj.TYPE_REG - p.From.Reg = s.SSEto387[v.Reg()] - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 -} - -// popAndSave pops a value off of the floating-point stack and stores -// it in the register assigned to v. -func popAndSave(s *gc.SSAGenState, v *ssa.Value) { - r := v.Reg() - if _, ok := s.SSEto387[r]; ok { - // Pop value, write to correct register. - p := s.Prog(x86.AFMOVDP) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_REG - p.To.Reg = s.SSEto387[v.Reg()] + 1 - } else { - // Don't actually pop value. This 387 register is now the - // new home for the not-yet-assigned-a-home SSE register. - // Increase the register mapping of all other registers by one. - for rSSE, r387 := range s.SSEto387 { - s.SSEto387[rSSE] = r387 + 1 - } - s.SSEto387[r] = x86.REG_F0 - } -} - -// loadPush returns the opcode for load+push of the given type. 
-func loadPush(t *types.Type) obj.As { - if t.Size() == 4 { - return x86.AFMOVF - } - return x86.AFMOVD -} - -// flush387 removes all entries from the 387 floating-point stack. -func flush387(s *gc.SSAGenState) { - for k := range s.SSEto387 { - p := s.Prog(x86.AFMOVDP) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_F0 - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_F0 - delete(s.SSEto387, k) - } -} - -func ssaGenBlock387(s *gc.SSAGenState, b, next *ssa.Block) { - // Empty the 387's FP stack before the block ends. - flush387(s) - - ssaGenBlock(s, b, next) -} - -// installControlWord saves the current floating-point control -// word and installs a new one loaded from cw. -// scratchReg must be an unused register. -// This call must be paired with restoreControlWord. -// Bytes 4-5 of the scratch space (s.AddrScratch) are used between -// this call and restoreControlWord. -func installControlWord(s *gc.SSAGenState, cw *obj.LSym, scratchReg int16) { - // Save current control word. - p := s.Prog(x86.AFSTCW) - s.AddrScratch(&p.To) - p.To.Offset += 4 - - // Materialize address of new control word. - // Note: this must be a seperate instruction to handle PIE correctly. - // See issue 41503. - p = s.Prog(x86.ALEAL) - p.From.Type = obj.TYPE_MEM - p.From.Name = obj.NAME_EXTERN - p.From.Sym = cw - p.To.Type = obj.TYPE_REG - p.To.Reg = scratchReg - - // Load replacement control word. - p = s.Prog(x86.AFLDCW) - p.From.Type = obj.TYPE_MEM - p.From.Reg = scratchReg -} -func restoreControlWord(s *gc.SSAGenState) { - p := s.Prog(x86.AFLDCW) - s.AddrScratch(&p.From) - p.From.Offset += 4 -} diff --git a/src/cmd/compile/internal/x86/galign.go b/src/cmd/compile/internal/x86/galign.go index 56c6989d93..2d20b6a6d0 100644 --- a/src/cmd/compile/internal/x86/galign.go +++ b/src/cmd/compile/internal/x86/galign.go @@ -7,26 +7,13 @@ package x86 import ( "cmd/compile/internal/gc" "cmd/internal/obj/x86" - "cmd/internal/objabi" - "fmt" - "os" ) func Init(arch *gc.Arch) { arch.LinkArch = &x86.Link386 arch.REGSP = x86.REGSP - switch v := objabi.GO386; v { - case "387": - arch.Use387 = true - arch.SSAGenValue = ssaGenValue387 - arch.SSAGenBlock = ssaGenBlock387 - case "sse2": - arch.SSAGenValue = ssaGenValue - arch.SSAGenBlock = ssaGenBlock - default: - fmt.Fprintf(os.Stderr, "unsupported setting GO386=%s\n", v) - gc.Exit(1) - } + arch.SSAGenValue = ssaGenValue + arch.SSAGenBlock = ssaGenBlock arch.MAXWIDTH = (1 << 32) - 1 arch.ZeroRange = zerorange diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go index c21ac32297..74a4570770 100644 --- a/src/cmd/compile/internal/x86/ssa.go +++ b/src/cmd/compile/internal/x86/ssa.go @@ -852,8 +852,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers gc.Warnl(v.Pos, "generated nil check") } - case ssa.Op386FCHS: - v.Fatalf("FCHS in non-387 mode") case ssa.OpClobber: p := s.Prog(x86.AMOVL) p.From.Type = obj.TYPE_CONST diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go index 3ac742fa55..5d62c1e8fa 100644 --- a/src/cmd/dist/build.go +++ b/src/cmd/dist/build.go @@ -30,7 +30,6 @@ var ( gohostos string goos string goarm string - go386 string gomips string gomips64 string goppc64 string @@ -142,16 +141,6 @@ func xinit() { } goarm = b - b = os.Getenv("GO386") - if b == "" { - if cansse2() { - b = "sse2" - } else { - b = "387" - } - } - go386 = b - b = os.Getenv("GOMIPS") if b == "" { b = "hardfloat" @@ -223,7 +212,6 @@ func xinit() { defaultldso = 
os.Getenv("GO_LDSO") // For tools being invoked but also for os.ExpandEnv. - os.Setenv("GO386", go386) os.Setenv("GOARCH", goarch) os.Setenv("GOARM", goarm) os.Setenv("GOHOSTARCH", gohostarch) @@ -1165,9 +1153,6 @@ func cmdenv() { if goarch == "arm" { xprintf(format, "GOARM", goarm) } - if goarch == "386" { - xprintf(format, "GO386", go386) - } if goarch == "mips" || goarch == "mipsle" { xprintf(format, "GOMIPS", gomips) } diff --git a/src/cmd/dist/buildruntime.go b/src/cmd/dist/buildruntime.go index 2744951597..67d1d72db4 100644 --- a/src/cmd/dist/buildruntime.go +++ b/src/cmd/dist/buildruntime.go @@ -41,7 +41,6 @@ func mkzversion(dir, file string) { // package objabi // // const defaultGOROOT = -// const defaultGO386 = // const defaultGOARM = // const defaultGOMIPS = // const defaultGOMIPS64 = @@ -70,7 +69,6 @@ func mkzbootstrap(file string) { fmt.Fprintln(&buf) fmt.Fprintf(&buf, "import \"runtime\"\n") fmt.Fprintln(&buf) - fmt.Fprintf(&buf, "const defaultGO386 = `%s`\n", go386) fmt.Fprintf(&buf, "const defaultGOARM = `%s`\n", goarm) fmt.Fprintf(&buf, "const defaultGOMIPS = `%s`\n", gomips) fmt.Fprintf(&buf, "const defaultGOMIPS64 = `%s`\n", gomips64) diff --git a/src/cmd/dist/cpuid_386.s b/src/cmd/dist/cpuid_386.s deleted file mode 100644 index 65fbb2dcb7..0000000000 --- a/src/cmd/dist/cpuid_386.s +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !gccgo - -TEXT ·cpuid(SB),$0-8 - MOVL ax+4(FP), AX - CPUID - MOVL info+0(FP), DI - MOVL AX, 0(DI) - MOVL BX, 4(DI) - MOVL CX, 8(DI) - MOVL DX, 12(DI) - RET - diff --git a/src/cmd/dist/cpuid_amd64.s b/src/cmd/dist/cpuid_amd64.s deleted file mode 100644 index ea0b9d4dc9..0000000000 --- a/src/cmd/dist/cpuid_amd64.s +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !gccgo - -TEXT ·cpuid(SB),$0-12 - MOVL ax+8(FP), AX - CPUID - MOVQ info+0(FP), DI - MOVL AX, 0(DI) - MOVL BX, 4(DI) - MOVL CX, 8(DI) - MOVL DX, 12(DI) - RET - diff --git a/src/cmd/dist/cpuid_default.s b/src/cmd/dist/cpuid_default.s deleted file mode 100644 index 6412a507a9..0000000000 --- a/src/cmd/dist/cpuid_default.s +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !386,!amd64,!gccgo - -#include "textflag.h" - -TEXT ·cpuid(SB),NOSPLIT,$0-0 - RET diff --git a/src/cmd/dist/util_gc.go b/src/cmd/dist/util_gc.go index 698beef704..17a0e6fbb5 100644 --- a/src/cmd/dist/util_gc.go +++ b/src/cmd/dist/util_gc.go @@ -6,18 +6,6 @@ package main -func cpuid(info *[4]uint32, ax uint32) - -func cansse2() bool { - if gohostarch != "386" && gohostarch != "amd64" { - return false - } - - var info [4]uint32 - cpuid(&info, 1) - return info[3]&(1<<26) != 0 // SSE2 -} - // useVFPv1 tries to execute one VFPv1 instruction on ARM. // It will crash the current process if VFPv1 is missing. 
func useVFPv1() diff --git a/src/cmd/dist/util_gccgo.go b/src/cmd/dist/util_gccgo.go index f9f01dc048..dc897236fb 100644 --- a/src/cmd/dist/util_gccgo.go +++ b/src/cmd/dist/util_gccgo.go @@ -6,19 +6,6 @@ package main -/* -int supports_sse2() { -#if defined(__i386__) || defined(__x86_64__) - return __builtin_cpu_supports("sse2"); -#else - return 0; -#endif -} -*/ -import "C" - -func cansse2() bool { return C.supports_sse2() != 0 } - func useVFPv1() {} func useVFPv3() {} diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go index 4bc87008ff..500682ed02 100644 --- a/src/cmd/go/alldocs.go +++ b/src/cmd/go/alldocs.go @@ -1853,9 +1853,6 @@ // GOARM // For GOARCH=arm, the ARM architecture for which to compile. // Valid values are 5, 6, 7. -// GO386 -// For GOARCH=386, the floating point instruction set. -// Valid values are 387, sse2. // GOMIPS // For GOARCH=mips{,le}, whether to use floating point instructions. // Valid values are hardfloat (default), softfloat. diff --git a/src/cmd/go/internal/cfg/cfg.go b/src/cmd/go/internal/cfg/cfg.go index 9bf1db73ef..ebbaf04115 100644 --- a/src/cmd/go/internal/cfg/cfg.go +++ b/src/cmd/go/internal/cfg/cfg.go @@ -244,7 +244,6 @@ var ( // Used in envcmd.MkEnv and build ID computations. GOARM = envOr("GOARM", fmt.Sprint(objabi.GOARM)) - GO386 = envOr("GO386", objabi.GO386) GOMIPS = envOr("GOMIPS", objabi.GOMIPS) GOMIPS64 = envOr("GOMIPS64", objabi.GOMIPS64) GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", objabi.GOPPC64)) @@ -268,8 +267,6 @@ func GetArchEnv() (key, val string) { switch Goarch { case "arm": return "GOARM", GOARM - case "386": - return "GO386", GO386 case "mips", "mipsle": return "GOMIPS", GOMIPS case "mips64", "mips64le": diff --git a/src/cmd/go/internal/envcmd/env.go b/src/cmd/go/internal/envcmd/env.go index 7bd75f7305..ee0bb0d0b2 100644 --- a/src/cmd/go/internal/envcmd/env.go +++ b/src/cmd/go/internal/envcmd/env.go @@ -497,7 +497,10 @@ func lineToKey(line string) string { } // sortKeyValues sorts a sequence of lines by key. -// It differs from sort.Strings in that GO386= sorts after GO=. +// It differs from sort.Strings in that keys which are GOx where x is an ASCII +// character smaller than = sort after GO=. +// (There are no such keys currently. It used to matter for GO386 which was +// removed in Go 1.16.) func sortKeyValues(lines []string) { sort.Slice(lines, func(i, j int) bool { return lineToKey(lines[i]) < lineToKey(lines[j]) diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go index 0ae5fd7ca9..befa10a0e4 100644 --- a/src/cmd/go/internal/help/helpdoc.go +++ b/src/cmd/go/internal/help/helpdoc.go @@ -581,9 +581,6 @@ Architecture-specific environment variables: GOARM For GOARCH=arm, the ARM architecture for which to compile. Valid values are 5, 6, 7. - GO386 - For GOARCH=386, the floating point instruction set. - Valid values are 387, sse2. GOMIPS For GOARCH=mips{,le}, whether to use floating point instructions. Valid values are hardfloat (default), softfloat. diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go index 51fc2b588d..e68b322c7d 100644 --- a/src/cmd/go/internal/work/exec.go +++ b/src/cmd/go/internal/work/exec.go @@ -271,7 +271,7 @@ func (b *Builder) buildActionID(a *Action) cache.ActionID { fmt.Fprintf(h, "asm %q %q %q\n", b.toolID("asm"), forcedAsmflags, p.Internal.Asmflags) } - // GO386, GOARM, GOMIPS, etc. + // GOARM, GOMIPS, etc. 
key, val := cfg.GetArchEnv() fmt.Fprintf(h, "%s=%s\n", key, val) @@ -1175,7 +1175,7 @@ func (b *Builder) printLinkerConfig(h io.Writer, p *load.Package) { fmt.Fprintf(h, "linkflags %q\n", p.Internal.Ldflags) } - // GO386, GOARM, GOMIPS, etc. + // GOARM, GOMIPS, etc. key, val := cfg.GetArchEnv() fmt.Fprintf(h, "%s=%s\n", key, val) diff --git a/src/cmd/internal/objabi/util.go b/src/cmd/internal/objabi/util.go index b81b73a022..cedb2d0a26 100644 --- a/src/cmd/internal/objabi/util.go +++ b/src/cmd/internal/objabi/util.go @@ -24,7 +24,6 @@ var ( GOROOT = envOr("GOROOT", defaultGOROOT) GOARCH = envOr("GOARCH", defaultGOARCH) GOOS = envOr("GOOS", defaultGOOS) - GO386 = envOr("GO386", defaultGO386) GOAMD64 = goamd64() GOARM = goarm() GOMIPS = gomips() @@ -136,6 +135,14 @@ func init() { if GOARCH != "amd64" { Regabi_enabled = 0 } + + if v := os.Getenv("GO386"); v != "" && v != "sse2" { + msg := fmt.Sprintf("unsupported setting GO386=%s", v) + if v == "387" { + msg += ". 387 support was dropped in Go 1.16. Consider using gccgo instead." + } + log.Fatal(msg) + } } // Note: must agree with runtime.framepointer_enabled. diff --git a/src/internal/cfg/cfg.go b/src/internal/cfg/cfg.go index bdbe9df3e7..023429e441 100644 --- a/src/internal/cfg/cfg.go +++ b/src/internal/cfg/cfg.go @@ -32,7 +32,6 @@ const KnownEnv = ` FC GCCGO GO111MODULE - GO386 GOARCH GOARM GOBIN diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go index ec87ec0c8a..0684eab973 100644 --- a/src/reflect/all_test.go +++ b/src/reflect/all_test.go @@ -4265,24 +4265,6 @@ var gFloat32 float32 func TestConvertNaNs(t *testing.T) { const snan uint32 = 0x7f800001 - - // Test to see if a store followed by a load of a signaling NaN - // maintains the signaling bit. The only platform known to fail - // this test is 386,GO386=387. The real test below will always fail - // if the platform can't even store+load a float without mucking - // with the bits. - gFloat32 = math.Float32frombits(snan) - runtime.Gosched() // make sure we don't optimize the store/load away - r := math.Float32bits(gFloat32) - if r != snan { - // This should only happen on 386,GO386=387. We have no way to - // test for 387, so we just make sure we're at least on 386. - if runtime.GOARCH != "386" { - t.Errorf("store/load of sNaN not faithful") - } - t.Skip("skipping test, float store+load not faithful") - } - type myFloat32 float32 x := V(myFloat32(math.Float32frombits(snan))) y := x.Convert(TypeOf(float32(0))) diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index c2e14cdcd6..c5bfb0f207 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -190,40 +190,25 @@ func (l *layout) restore() { func gen386() { p("PUSHFL") - // Save general purpose registers. + // Assign stack offsets. var l = layout{sp: "SP"} for _, reg := range regNames386 { - if reg == "SP" || strings.HasPrefix(reg, "X") { + if reg == "SP" { continue } - l.add("MOVL", reg, 4) - } - - // Save the 387 state. - l.addSpecial( - "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)", - "FRSTOR %d(SP)", - 108) - - // Save SSE state only if supported. 
- lSSE := layout{stack: l.stack, sp: "SP"} - for i := 0; i < 8; i++ { - lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16) + if strings.HasPrefix(reg, "X") { + l.add("MOVUPS", reg, 16) + } else { + l.add("MOVL", reg, 4) + } } - p("ADJSP $%d", lSSE.stack) + p("ADJSP $%d", l.stack) p("NOP SP") l.save() - p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse") - lSSE.save() - label("nosse:") p("CALL ·asyncPreempt2(SB)") - p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse2") - lSSE.restore() - label("nosse2:") l.restore() - p("ADJSP $%d", -lSSE.stack) - + p("ADJSP $%d", -l.stack) p("POPFL") p("RET") } diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s index a00ac8f385..5c9b8ea224 100644 --- a/src/runtime/preempt_386.s +++ b/src/runtime/preempt_386.s @@ -5,7 +5,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 PUSHFL - ADJSP $264 + ADJSP $156 NOP SP MOVL AX, 0(SP) MOVL CX, 4(SP) @@ -14,32 +14,23 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVL BP, 16(SP) MOVL SI, 20(SP) MOVL DI, 24(SP) - FSAVE 28(SP) - FLDCW runtime·controlWord64(SB) - CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 - JNE nosse - MOVUPS X0, 136(SP) - MOVUPS X1, 152(SP) - MOVUPS X2, 168(SP) - MOVUPS X3, 184(SP) - MOVUPS X4, 200(SP) - MOVUPS X5, 216(SP) - MOVUPS X6, 232(SP) - MOVUPS X7, 248(SP) -nosse: + MOVUPS X0, 28(SP) + MOVUPS X1, 44(SP) + MOVUPS X2, 60(SP) + MOVUPS X3, 76(SP) + MOVUPS X4, 92(SP) + MOVUPS X5, 108(SP) + MOVUPS X6, 124(SP) + MOVUPS X7, 140(SP) CALL ·asyncPreempt2(SB) - CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 - JNE nosse2 - MOVUPS 248(SP), X7 - MOVUPS 232(SP), X6 - MOVUPS 216(SP), X5 - MOVUPS 200(SP), X4 - MOVUPS 184(SP), X3 - MOVUPS 168(SP), X2 - MOVUPS 152(SP), X1 - MOVUPS 136(SP), X0 -nosse2: - FRSTOR 28(SP) + MOVUPS 140(SP), X7 + MOVUPS 124(SP), X6 + MOVUPS 108(SP), X5 + MOVUPS 92(SP), X4 + MOVUPS 76(SP), X3 + MOVUPS 60(SP), X2 + MOVUPS 44(SP), X1 + MOVUPS 28(SP), X0 MOVL 24(SP), DI MOVL 20(SP), SI MOVL 16(SP), BP @@ -47,6 +38,6 @@ nosse2: MOVL 8(SP), DX MOVL 4(SP), CX MOVL 0(SP), AX - ADJSP $-264 + ADJSP $-156 POPFL RET diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go index 38e0b32801..996c0611fd 100644 --- a/src/runtime/vlrt.go +++ b/src/runtime/vlrt.go @@ -263,7 +263,7 @@ func slowdodiv(n, d uint64) (q, r uint64) { return q, n } -// Floating point control word values for GOARCH=386 GO386=387. +// Floating point control word values. // Bits 0-5 are bits to disable floating-point exceptions. // Bits 8-9 are the precision control: // 0 = single precision a.k.a. 
float32 @@ -273,6 +273,5 @@ func slowdodiv(n, d uint64) (q, r uint64) { // 3 = round toward zero var ( controlWord64 uint16 = 0x3f + 2<<8 + 0<<10 - controlWord32 = 0x3f + 0<<8 + 0<<10 - controlWord64trunc = 0x3f + 2<<8 + 3<<10 + controlWord64trunc uint16 = 0x3f + 2<<8 + 3<<10 ) diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go index 0bdb66a376..30f39a8da1 100644 --- a/test/codegen/arithmetic.go +++ b/test/codegen/arithmetic.go @@ -125,7 +125,7 @@ func Mul_n120(n int) int { func MulMemSrc(a []uint32, b []float32) { // 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+` a[0] *= a[1] - // 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+` + // 386:`MULSS\s4\([A-Z]+\),\sX[0-9]+` // amd64:`MULSS\s4\([A-Z]+\),\sX[0-9]+` b[0] *= b[1] } @@ -167,7 +167,7 @@ func MergeMuls5(a, n int) int { // -------------- // func DivMemSrc(a []float64) { - // 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` + // 386:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` // amd64:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` a[0] /= a[1] } @@ -211,7 +211,7 @@ func ConstDivs(n1 uint, n2 int) (uint, int) { func FloatDivs(a []float32) float32 { // amd64:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` - // 386/sse2:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` + // 386:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` return a[1] / a[2] } diff --git a/test/codegen/floats.go b/test/codegen/floats.go index 3fae1a327c..d115800a67 100644 --- a/test/codegen/floats.go +++ b/test/codegen/floats.go @@ -6,8 +6,6 @@ package codegen -import "math" - // This file contains codegen tests related to arithmetic // simplifications and optimizations on float types. // For codegen tests on integer types, see arithmetic.go. @@ -17,8 +15,7 @@ import "math" // --------------------- // func Mul2(f float64) float64 { - // 386/sse2:"ADDSD",-"MULSD" - // 386/387:"FADDDP",-"FMULDP" + // 386:"ADDSD",-"MULSD" // amd64:"ADDSD",-"MULSD" // arm/7:"ADDD",-"MULD" // arm64:"FADDD",-"FMULD" @@ -28,8 +25,7 @@ func Mul2(f float64) float64 { } func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { - // 386/sse2:"MULSD",-"DIVSD" - // 386/387:"FMULDP",-"FDIVDP" + // 386:"MULSD",-"DIVSD" // amd64:"MULSD",-"DIVSD" // arm/7:"MULD",-"DIVD" // arm64:"FMULD",-"FDIVD" @@ -37,8 +33,7 @@ func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { // ppc64le:"FMUL",-"FDIV" x := f1 / 16.0 - // 386/sse2:"MULSD",-"DIVSD" - // 386/387:"FMULDP",-"FDIVDP" + // 386:"MULSD",-"DIVSD" // amd64:"MULSD",-"DIVSD" // arm/7:"MULD",-"DIVD" // arm64:"FMULD",-"FDIVD" @@ -46,8 +41,7 @@ func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { // ppc64le:"FMUL",-"FDIVD" y := f2 / 0.125 - // 386/sse2:"ADDSD",-"DIVSD",-"MULSD" - // 386/387:"FADDDP",-"FDIVDP",-"FMULDP" + // 386:"ADDSD",-"DIVSD",-"MULSD" // amd64:"ADDSD",-"DIVSD",-"MULSD" // arm/7:"ADDD",-"MULD",-"DIVD" // arm64:"FADDD",-"FMULD",-"FDIVD" @@ -58,11 +52,6 @@ func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { return x, y, z } -func getPi() float64 { - // 386/387:"FLDPI" - return math.Pi -} - func indexLoad(b0 []float32, b1 float32, idx int) float32 { // arm64:`FMOVS\s\(R[0-9]+\)\(R[0-9]+\),\sF[0-9]+` return b0[idx] * b1 diff --git a/test/codegen/math.go b/test/codegen/math.go index 1ebfda0405..fe678eea23 100644 --- a/test/codegen/math.go +++ b/test/codegen/math.go @@ -46,7 +46,7 @@ func approx(x float64) { func sqrt(x float64) float64 { // amd64:"SQRTSD" - // 386/387:"FSQRT" 386/sse2:"SQRTSD" + // 386:"SQRTSD" // arm64:"FSQRTD" // arm/7:"SQRTD" // mips/hardfloat:"SQRTD" mips/softfloat:-"SQRTD" diff --git a/test/codegen/memops.go b/test/codegen/memops.go index a234283146..4b003ad861 100644 --- 
a/test/codegen/memops.go +++ b/test/codegen/memops.go @@ -175,33 +175,33 @@ func idxInt64(x, y []int64, i int) { func idxFloat32(x, y []float32, i int) { var t float32 - // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` - // 386/sse2: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` + // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` + // 386: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` t = x[i+1] - // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` - // 386/sse2: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` + // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` + // 386: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` y[i+1] = t - // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` - // 386/sse2: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` + // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` + // 386: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` t = x[16*i+1] - // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` - // 386/sse2: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` + // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` + // 386: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` y[16*i+1] = t } func idxFloat64(x, y []float64, i int) { var t float64 - // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` - // 386/sse2: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` + // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` + // 386: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` t = x[i+1] - // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` - // 386/sse2: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` + // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` + // 386: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` y[i+1] = t - // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` - // 386/sse2: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` + // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` + // 386: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` t = x[16*i+1] - // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` - // 386/sse2: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` + // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` + // 386: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` y[16*i+1] = t } diff --git a/test/run.go b/test/run.go index 95b94b7277..77710fd89a 100644 --- a/test/run.go +++ b/test/run.go @@ -1489,7 +1489,7 @@ var ( // value[0] is the variant-changing environment variable, and values[1:] // are the supported variants. archVariants = map[string][]string{ - "386": {"GO386", "387", "sse2"}, + "386": {}, "amd64": {}, "arm": {"GOARM", "5", "6", "7"}, "arm64": {}, @@ -1511,12 +1511,12 @@ type wantedAsmOpcode struct { found bool // true if the opcode check matched at least one in the output } -// A build environment triplet separated by slashes (eg: linux/386/sse2). +// A build environment triplet separated by slashes (eg: linux/arm/7). 
// The third field can be empty if the arch does not support variants (eg: "plan9/amd64/") type buildEnv string // Environ returns the environment it represents in cmd.Environ() "key=val" format -// For instance, "linux/386/sse2".Environ() returns {"GOOS=linux", "GOARCH=386", "GO386=sse2"} +// For instance, "linux/arm/7".Environ() returns {"GOOS=linux", "GOARCH=arm", "GOARM=7"} func (b buildEnv) Environ() []string { fields := strings.Split(string(b), "/") if len(fields) != 3 { @@ -1571,11 +1571,11 @@ func (t *test) wantedAsmOpcodes(fn string) asmChecks { var arch, subarch, os string switch { - case archspec[2] != "": // 3 components: "linux/386/sse2" + case archspec[2] != "": // 3 components: "linux/arm/7" os, arch, subarch = archspec[0], archspec[1][1:], archspec[2][1:] - case archspec[1] != "": // 2 components: "386/sse2" + case archspec[1] != "": // 2 components: "arm/7" os, arch, subarch = "linux", archspec[0], archspec[1][1:] - default: // 1 component: "386" + default: // 1 component: "arm" os, arch, subarch = "linux", archspec[0], "" if arch == "wasm" { os = "js" -- cgit v1.3 From 04b8a9fea57e37589d82410281f22ebde0027808 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 6 Oct 2020 14:42:15 -0700 Subject: all: implement GO386=softfloat Backstop support for non-sse2 chips now that 387 is gone. RELNOTE=yes Change-Id: Ib10e69c4a3654c15a03568f93393437e1939e013 Reviewed-on: https://go-review.googlesource.com/c/go/+/260017 Trust: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Go Bot Reviewed-by: Ian Lance Taylor --- src/cmd/compile/internal/x86/galign.go | 15 +++++++++++++++ src/cmd/dist/build.go | 11 +++++++++++ src/cmd/dist/buildruntime.go | 2 ++ src/cmd/go/alldocs.go | 3 +++ src/cmd/go/internal/cfg/cfg.go | 3 +++ src/cmd/go/internal/help/helpdoc.go | 3 +++ src/cmd/internal/objabi/util.go | 9 +-------- src/internal/cfg/cfg.go | 1 + test/codegen/arithmetic.go | 6 +++--- test/codegen/floats.go | 8 ++++---- test/codegen/math.go | 2 +- test/codegen/memops.go | 32 ++++++++++++++++---------------- test/run.go | 12 ++++++------ 13 files changed, 69 insertions(+), 38 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/x86/galign.go b/src/cmd/compile/internal/x86/galign.go index 2d20b6a6d0..e137daa3fc 100644 --- a/src/cmd/compile/internal/x86/galign.go +++ b/src/cmd/compile/internal/x86/galign.go @@ -7,6 +7,9 @@ package x86 import ( "cmd/compile/internal/gc" "cmd/internal/obj/x86" + "cmd/internal/objabi" + "fmt" + "os" ) func Init(arch *gc.Arch) { @@ -15,6 +18,18 @@ func Init(arch *gc.Arch) { arch.SSAGenValue = ssaGenValue arch.SSAGenBlock = ssaGenBlock arch.MAXWIDTH = (1 << 32) - 1 + switch v := objabi.GO386; v { + case "sse2": + case "softfloat": + arch.SoftFloat = true + case "387": + fmt.Fprintf(os.Stderr, "unsupported setting GO386=387. 
Consider using GO386=softfloat instead.\n") + gc.Exit(1) + default: + fmt.Fprintf(os.Stderr, "unsupported setting GO386=%s\n", v) + gc.Exit(1) + + } arch.ZeroRange = zerorange arch.Ginsnop = ginsnop diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go index 3b3eb113b1..69a66abd2d 100644 --- a/src/cmd/dist/build.go +++ b/src/cmd/dist/build.go @@ -30,6 +30,7 @@ var ( gohostos string goos string goarm string + go386 string gomips string gomips64 string goppc64 string @@ -141,6 +142,12 @@ func xinit() { } goarm = b + b = os.Getenv("GO386") + if b == "" { + b = "sse2" + } + go386 = b + b = os.Getenv("GOMIPS") if b == "" { b = "hardfloat" @@ -212,6 +219,7 @@ func xinit() { defaultldso = os.Getenv("GO_LDSO") // For tools being invoked but also for os.ExpandEnv. + os.Setenv("GO386", go386) os.Setenv("GOARCH", goarch) os.Setenv("GOARM", goarm) os.Setenv("GOHOSTARCH", gohostarch) @@ -1153,6 +1161,9 @@ func cmdenv() { if goarch == "arm" { xprintf(format, "GOARM", goarm) } + if goarch == "386" { + xprintf(format, "GO386", go386) + } if goarch == "mips" || goarch == "mipsle" { xprintf(format, "GOMIPS", gomips) } diff --git a/src/cmd/dist/buildruntime.go b/src/cmd/dist/buildruntime.go index 67d1d72db4..2744951597 100644 --- a/src/cmd/dist/buildruntime.go +++ b/src/cmd/dist/buildruntime.go @@ -41,6 +41,7 @@ func mkzversion(dir, file string) { // package objabi // // const defaultGOROOT = +// const defaultGO386 = // const defaultGOARM = // const defaultGOMIPS = // const defaultGOMIPS64 = @@ -69,6 +70,7 @@ func mkzbootstrap(file string) { fmt.Fprintln(&buf) fmt.Fprintf(&buf, "import \"runtime\"\n") fmt.Fprintln(&buf) + fmt.Fprintf(&buf, "const defaultGO386 = `%s`\n", go386) fmt.Fprintf(&buf, "const defaultGOARM = `%s`\n", goarm) fmt.Fprintf(&buf, "const defaultGOMIPS = `%s`\n", gomips) fmt.Fprintf(&buf, "const defaultGOMIPS64 = `%s`\n", gomips64) diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go index 14840efb22..5cb32c80e9 100644 --- a/src/cmd/go/alldocs.go +++ b/src/cmd/go/alldocs.go @@ -1852,6 +1852,9 @@ // GOARM // For GOARCH=arm, the ARM architecture for which to compile. // Valid values are 5, 6, 7. +// GO386 +// For GOARCH=386, how to implement floating point instructions. +// Valid values are sse2 (default), softfloat. // GOMIPS // For GOARCH=mips{,le}, whether to use floating point instructions. // Valid values are hardfloat (default), softfloat. diff --git a/src/cmd/go/internal/cfg/cfg.go b/src/cmd/go/internal/cfg/cfg.go index 9169c12d8f..67d581f6e6 100644 --- a/src/cmd/go/internal/cfg/cfg.go +++ b/src/cmd/go/internal/cfg/cfg.go @@ -256,6 +256,7 @@ var ( // Used in envcmd.MkEnv and build ID computations. GOARM = envOr("GOARM", fmt.Sprint(objabi.GOARM)) + GO386 = envOr("GO386", objabi.GO386) GOMIPS = envOr("GOMIPS", objabi.GOMIPS) GOMIPS64 = envOr("GOMIPS64", objabi.GOMIPS64) GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", objabi.GOPPC64)) @@ -279,6 +280,8 @@ func GetArchEnv() (key, val string) { switch Goarch { case "arm": return "GOARM", GOARM + case "386": + return "GO386", GO386 case "mips", "mipsle": return "GOMIPS", GOMIPS case "mips64", "mips64le": diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go index befa10a0e4..8dfabbaa4a 100644 --- a/src/cmd/go/internal/help/helpdoc.go +++ b/src/cmd/go/internal/help/helpdoc.go @@ -581,6 +581,9 @@ Architecture-specific environment variables: GOARM For GOARCH=arm, the ARM architecture for which to compile. Valid values are 5, 6, 7. 
+ GO386 + For GOARCH=386, how to implement floating point instructions. + Valid values are sse2 (default), softfloat. GOMIPS For GOARCH=mips{,le}, whether to use floating point instructions. Valid values are hardfloat (default), softfloat. diff --git a/src/cmd/internal/objabi/util.go b/src/cmd/internal/objabi/util.go index cedb2d0a26..b81b73a022 100644 --- a/src/cmd/internal/objabi/util.go +++ b/src/cmd/internal/objabi/util.go @@ -24,6 +24,7 @@ var ( GOROOT = envOr("GOROOT", defaultGOROOT) GOARCH = envOr("GOARCH", defaultGOARCH) GOOS = envOr("GOOS", defaultGOOS) + GO386 = envOr("GO386", defaultGO386) GOAMD64 = goamd64() GOARM = goarm() GOMIPS = gomips() @@ -135,14 +136,6 @@ func init() { if GOARCH != "amd64" { Regabi_enabled = 0 } - - if v := os.Getenv("GO386"); v != "" && v != "sse2" { - msg := fmt.Sprintf("unsupported setting GO386=%s", v) - if v == "387" { - msg += ". 387 support was dropped in Go 1.16. Consider using gccgo instead." - } - log.Fatal(msg) - } } // Note: must agree with runtime.framepointer_enabled. diff --git a/src/internal/cfg/cfg.go b/src/internal/cfg/cfg.go index 023429e441..bdbe9df3e7 100644 --- a/src/internal/cfg/cfg.go +++ b/src/internal/cfg/cfg.go @@ -32,6 +32,7 @@ const KnownEnv = ` FC GCCGO GO111MODULE + GO386 GOARCH GOARM GOBIN diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go index 30f39a8da1..0bdb66a376 100644 --- a/test/codegen/arithmetic.go +++ b/test/codegen/arithmetic.go @@ -125,7 +125,7 @@ func Mul_n120(n int) int { func MulMemSrc(a []uint32, b []float32) { // 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+` a[0] *= a[1] - // 386:`MULSS\s4\([A-Z]+\),\sX[0-9]+` + // 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+` // amd64:`MULSS\s4\([A-Z]+\),\sX[0-9]+` b[0] *= b[1] } @@ -167,7 +167,7 @@ func MergeMuls5(a, n int) int { // -------------- // func DivMemSrc(a []float64) { - // 386:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` + // 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` // amd64:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` a[0] /= a[1] } @@ -211,7 +211,7 @@ func ConstDivs(n1 uint, n2 int) (uint, int) { func FloatDivs(a []float32) float32 { // amd64:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` - // 386:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` + // 386/sse2:`DIVSS\s8\([A-Z]+\),\sX[0-9]+` return a[1] / a[2] } diff --git a/test/codegen/floats.go b/test/codegen/floats.go index d115800a67..83b4a358a5 100644 --- a/test/codegen/floats.go +++ b/test/codegen/floats.go @@ -15,7 +15,7 @@ package codegen // --------------------- // func Mul2(f float64) float64 { - // 386:"ADDSD",-"MULSD" + // 386/sse2:"ADDSD",-"MULSD" // amd64:"ADDSD",-"MULSD" // arm/7:"ADDD",-"MULD" // arm64:"FADDD",-"FMULD" @@ -25,7 +25,7 @@ func Mul2(f float64) float64 { } func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { - // 386:"MULSD",-"DIVSD" + // 386/sse2:"MULSD",-"DIVSD" // amd64:"MULSD",-"DIVSD" // arm/7:"MULD",-"DIVD" // arm64:"FMULD",-"FDIVD" @@ -33,7 +33,7 @@ func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { // ppc64le:"FMUL",-"FDIV" x := f1 / 16.0 - // 386:"MULSD",-"DIVSD" + // 386/sse2:"MULSD",-"DIVSD" // amd64:"MULSD",-"DIVSD" // arm/7:"MULD",-"DIVD" // arm64:"FMULD",-"FDIVD" @@ -41,7 +41,7 @@ func DivPow2(f1, f2, f3 float64) (float64, float64, float64) { // ppc64le:"FMUL",-"FDIVD" y := f2 / 0.125 - // 386:"ADDSD",-"DIVSD",-"MULSD" + // 386/sse2:"ADDSD",-"DIVSD",-"MULSD" // amd64:"ADDSD",-"DIVSD",-"MULSD" // arm/7:"ADDD",-"MULD",-"DIVD" // arm64:"FADDD",-"FMULD",-"FDIVD" diff --git a/test/codegen/math.go b/test/codegen/math.go index fe678eea23..ac8071400e 100644 --- a/test/codegen/math.go +++ b/test/codegen/math.go @@ 
-46,7 +46,7 @@ func approx(x float64) { func sqrt(x float64) float64 { // amd64:"SQRTSD" - // 386:"SQRTSD" + // 386/sse2:"SQRTSD" 386/softfloat:-"SQRTD" // arm64:"FSQRTD" // arm/7:"SQRTD" // mips/hardfloat:"SQRTD" mips/softfloat:-"SQRTD" diff --git a/test/codegen/memops.go b/test/codegen/memops.go index 4b003ad861..a234283146 100644 --- a/test/codegen/memops.go +++ b/test/codegen/memops.go @@ -175,33 +175,33 @@ func idxInt64(x, y []int64, i int) { func idxFloat32(x, y []float32, i int) { var t float32 - // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` - // 386: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` + // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` + // 386/sse2: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+` t = x[i+1] - // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` - // 386: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` + // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` + // 386/sse2: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\)` y[i+1] = t - // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` - // 386: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` + // amd64: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` + // 386/sse2: `MOVSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\), X[0-9]+` t = x[16*i+1] - // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` - // 386: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` + // amd64: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` + // 386/sse2: `MOVSS\tX[0-9]+, 4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[14]\)` y[16*i+1] = t } func idxFloat64(x, y []float64, i int) { var t float64 - // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` - // 386: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` + // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` + // 386/sse2: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+` t = x[i+1] - // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` - // 386: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` + // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` + // 386/sse2: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\)` y[i+1] = t - // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` - // 386: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` + // amd64: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` + // 386/sse2: `MOVSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\), X[0-9]+` t = x[16*i+1] - // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` - // 386: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` + // amd64: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` + // 386/sse2: `MOVSD\tX[0-9]+, 8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*[18]\)` y[16*i+1] = t } diff --git a/test/run.go b/test/run.go index 77710fd89a..672861c8d7 100644 --- a/test/run.go +++ b/test/run.go @@ -1489,7 +1489,7 @@ var ( // value[0] is the variant-changing environment variable, and values[1:] // are the supported variants. archVariants = map[string][]string{ - "386": {}, + "386": {"GO386", "sse2", "softfloat"}, "amd64": {}, "arm": {"GOARM", "5", "6", "7"}, "arm64": {}, @@ -1511,12 +1511,12 @@ type wantedAsmOpcode struct { found bool // true if the opcode check matched at least one in the output } -// A build environment triplet separated by slashes (eg: linux/arm/7). +// A build environment triplet separated by slashes (eg: linux/386/sse2). 
// The third field can be empty if the arch does not support variants (eg: "plan9/amd64/") type buildEnv string // Environ returns the environment it represents in cmd.Environ() "key=val" format -// For instance, "linux/arm/7".Environ() returns {"GOOS=linux", "GOARCH=arm", "GOARM=7"} +// For instance, "linux/386/sse2".Environ() returns {"GOOS=linux", "GOARCH=386", "GO386=sse2"} func (b buildEnv) Environ() []string { fields := strings.Split(string(b), "/") if len(fields) != 3 { @@ -1571,11 +1571,11 @@ func (t *test) wantedAsmOpcodes(fn string) asmChecks { var arch, subarch, os string switch { - case archspec[2] != "": // 3 components: "linux/arm/7" + case archspec[2] != "": // 3 components: "linux/386/sse2" os, arch, subarch = archspec[0], archspec[1][1:], archspec[2][1:] - case archspec[1] != "": // 2 components: "arm/7" + case archspec[1] != "": // 2 components: "386/sse2" os, arch, subarch = "linux", archspec[0], archspec[1][1:] - default: // 1 component: "arm" + default: // 1 component: "386" os, arch, subarch = "linux", archspec[0], "" if arch == "wasm" { os = "js" -- cgit v1.3 From c3c6fbf31419d37b0ae7d99b5378f6f8e9080b24 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Fri, 23 Oct 2020 12:12:34 -0500 Subject: cmd/compile: combine more 32 bit shift and mask operations on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combine (AND m (SRWconst x)) or (SRWconst (AND m x)) when mask m and the shift value produce a constant which can be encoded into an RLWINM instruction. Combine (CLRLSLDI (SRWconst x)) if combining the underlying rotate masks produces a constant which can be encoded into RLWINM. Likewise for (SLDconst (SRWconst x)) and (CLRLSLDI (RLWINM x)). Combine rotate word + AND operations which can be encoded as a single RLWINM/RLWNM instruction.
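For illustration only (not part of this patch), the minimal standalone Go sketch below mirrors the mask-shape test these rules rely on: a 32-bit mask is usable by rlwinm/rlwnm only if it is a single contiguous run of 1 bits, possibly wrapping around from bit 31 to bit 0. It also shows the kind of source pattern the rules target, a shift plus small mask used as a table index such as b[(x>>14)&0xFF]. The helper and function names here are illustrative, not identifiers added by this change.

package main

import "fmt"

// isWordRotateMask reports whether m, taken as a 32-bit value, is one
// contiguous run of 1 bits, possibly wrapping around bit 31 to bit 0.
// This mirrors the isPPC64WordRotateMask check introduced by this patch.
func isWordRotateMask(m uint32) bool {
	if m == 0 {
		return false
	}
	// Adding the lowest set bit to a contiguous run clears the whole run.
	if m&(m+(m&-m)) == 0 {
		return true
	}
	// A wrapping mask has a contiguous run of 0 bits, so test the complement.
	n := ^m
	return n&(n+(n&-n)) == 0
}

// lookup shows the kind of code that benefits: a shift and a small mask
// used as a table index, which can collapse into one rotate-and-mask.
func lookup(tab *[256]byte, x uint32) byte {
	return tab[(x>>14)&0xFF]
}

func main() {
	fmt.Println(isWordRotateMask(0x00FF0000)) // true: one contiguous run
	fmt.Println(isWordRotateMask(0xF000000F)) // true: run wraps around bit 31
	fmt.Println(isWordRotateMask(0x00F000F0)) // false: two separate runs
	fmt.Println(lookup(new([256]byte), 0xABCDEF))
}
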
The most notable performance improvements arise from the crypto benchmarks below (GOARCH=power8 on a ppc64le/linux): pkg:golang.org/x/crypto/blowfish goos:linux goarch:ppc64le ExpandKeyWithSalt 52.2µs ± 0% 47.5µs ± 0% -8.88% ExpandKey 44.4µs ± 0% 40.3µs ± 0% -9.15% pkg:golang.org/x/crypto/ssh/internal/bcrypt_pbkdf goos:linux goarch:ppc64le Key 57.6ms ± 0% 52.3ms ± 0% -9.13% pkg:golang.org/x/crypto/bcrypt goos:linux goarch:ppc64le Equal 90.9ms ± 0% 82.6ms ± 0% -9.13% DefaultCost 91.0ms ± 0% 82.7ms ± 0% -9.12% Change-Id: I59a0ca29face38f4ab46e37124c32906f216c4ce Reviewed-on: https://go-review.googlesource.com/c/go/+/260798 Run-TryBot: Carlos Eduardo Seo TryBot-Result: Go Bot Reviewed-by: Lynn Boger Reviewed-by: Carlos Eduardo Seo Trust: Lynn Boger --- src/cmd/compile/internal/ppc64/ssa.go | 18 ++ src/cmd/compile/internal/ssa/gen/PPC64.rules | 25 ++ src/cmd/compile/internal/ssa/gen/PPC64Ops.go | 5 + src/cmd/compile/internal/ssa/opGen.go | 48 ++++ src/cmd/compile/internal/ssa/rewrite.go | 137 ++++++++++ src/cmd/compile/internal/ssa/rewritePPC64.go | 368 +++++++++++++++++++++++++++ src/cmd/compile/internal/ssa/rewrite_test.go | 181 +++++++++++++ test/codegen/rotate.go | 45 ++++ test/codegen/shift.go | 94 +++++-- 9 files changed, 900 insertions(+), 21 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index 79f18bfebb..3888aa6527 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -649,6 +649,24 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + // Auxint holds encoded rotate + mask + case ssa.OpPPC64RLWINM, ssa.OpPPC64RLWMI: + rot, _, _, mask := ssa.DecodePPC64RotateMask(v.AuxInt) + p := s.Prog(v.Op.Asm()) + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.Reg = v.Args[0].Reg() + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(rot)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: int64(mask)}) + + // Auxint holds mask + case ssa.OpPPC64RLWNM: + _, _, _, mask := ssa.DecodePPC64RotateMask(v.AuxInt) + p := s.Prog(v.Op.Asm()) + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.Reg = v.Args[0].Reg() + p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: int64(mask)}) + case ssa.OpPPC64MADDLD: r := v.Reg() r1 := v.Args[0].Reg() diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules index 6175b42b89..558b09c9f2 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64.rules +++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules @@ -150,6 +150,31 @@ (ROTLW x (MOVDconst [c])) => (ROTLWconst x [c&31]) (ROTL x (MOVDconst [c])) => (ROTLconst x [c&63]) +// Combine rotate and mask operations +(ANDconst [m] (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x) +(AND (MOVDconst [m]) (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x) +(ANDconst [m] (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r) +(AND (MOVDconst [m]) (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r) + +// Note, any rotated word bitmask is still a valid word bitmask. 
+(ROTLWconst [r] (AND (MOVDconst [m]) x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) +(ROTLWconst [r] (ANDconst [m] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) + +(ANDconst [m] (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0]) +(ANDconst [m] (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x) +(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0]) +(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x) + +(SRWconst (ANDconst [m] x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0]) +(SRWconst (ANDconst [m] x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x) +(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0]) +(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x) + +// Merge shift right + shift left and clear left (e.g for a table lookup) +(CLRLSLDI [c] (SRWconst [s] x)) && mergePPC64ClrlsldiSrw(int64(c),s) != 0 => (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x) +(SLDconst [l] (SRWconst [r] x)) && mergePPC64SldiSrw(l,r) != 0 => (RLWINM [mergePPC64SldiSrw(l,r)] x) +// The following reduction shows up frequently too. e.g b[(x>>14)&0xFF] +(CLRLSLDI [c] i:(RLWINM [s] x)) && mergePPC64ClrlsldiRlwinm(c,s) != 0 => (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x) // large constant shifts (Lsh64x64 _ (MOVDconst [c])) && uint64(c) >= 64 => (MOVDconst [0]) diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go index f4a53262f0..f7198b90c3 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go @@ -137,6 +137,7 @@ func init() { gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}} gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} + gp21a0 = regInfo{inputs: []regMask{gp, gp | sp | sb}, outputs: []regMask{gp}} gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} gp22 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}} gp32 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}} @@ -227,6 +228,10 @@ func init() { {name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits {name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"}, + {name: "RLWINM", argLength: 1, reg: gp11, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by immediate "rlwinm". encodePPC64RotateMask describes aux + {name: "RLWNM", argLength: 2, reg: gp21, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by "rlwnm". 
encodePPC64RotateMask describes aux + {name: "RLWMI", argLength: 2, reg: gp21a0, asm: "RLWMI", aux: "Int64", resultInArg0: true}, // "rlwimi" similar aux encoding as above + {name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros {name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit) diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 779c19f72d..bb1cbc0baa 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1871,6 +1871,9 @@ const ( OpPPC64ROTLconst OpPPC64ROTLWconst OpPPC64EXTSWSLconst + OpPPC64RLWINM + OpPPC64RLWNM + OpPPC64RLWMI OpPPC64CNTLZD OpPPC64CNTLZW OpPPC64CNTTZD @@ -24971,6 +24974,51 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "RLWINM", + auxType: auxInt64, + argLen: 1, + asm: ppc64.ARLWNM, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + outputs: []outputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + }, + }, + { + name: "RLWNM", + auxType: auxInt64, + argLen: 2, + asm: ppc64.ARLWNM, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + {1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + outputs: []outputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + }, + }, + { + name: "RLWMI", + auxType: auxInt64, + argLen: 2, + resultInArg0: true, + asm: ppc64.ARLWMI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + {1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + outputs: []outputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + }, + }, { name: "CNTLZD", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index e5f858a339..9b3c83d1cf 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -1381,6 +1381,71 @@ func GetPPC64Shiftme(auxint int64) int64 { return int64(int8(auxint)) } +// Test if this value can encoded as a mask for a rlwinm like +// operation. Masks can also extend from the msb and wrap to +// the lsb too. That is, the valid masks are 32 bit strings +// of the form: 0..01..10..0 or 1..10..01..1 or 1...1 +func isPPC64WordRotateMask(v64 int64) bool { + // Isolate rightmost 1 (if none 0) and add. + v := uint32(v64) + vp := (v & -v) + v + // Likewise, for the wrapping case. + vn := ^v + vpn := (vn & -vn) + vn + return (v&vp == 0 || vn&vpn == 0) && v != 0 +} + +// Compress mask and and shift into single value of the form +// me | mb<<8 | rotate<<16 | nbits<<24 where me and mb can +// be used to regenerate the input mask. 
+func encodePPC64RotateMask(rotate, mask, nbits int64) int64 { + var mb, me, mbn, men int + + // Determine boundaries and then decode them + if mask == 0 || ^mask == 0 || rotate >= nbits { + panic("Invalid PPC64 rotate mask") + } else if nbits == 32 { + mb = bits.LeadingZeros32(uint32(mask)) + me = 32 - bits.TrailingZeros32(uint32(mask)) + mbn = bits.LeadingZeros32(^uint32(mask)) + men = 32 - bits.TrailingZeros32(^uint32(mask)) + } else { + mb = bits.LeadingZeros64(uint64(mask)) + me = 64 - bits.TrailingZeros64(uint64(mask)) + mbn = bits.LeadingZeros64(^uint64(mask)) + men = 64 - bits.TrailingZeros64(^uint64(mask)) + } + // Check for a wrapping mask (e.g bits at 0 and 63) + if mb == 0 && me == int(nbits) { + // swap the inverted values + mb, me = men, mbn + } + + return int64(me) | int64(mb<<8) | int64(rotate<<16) | int64(nbits<<24) +} + +// The inverse operation of encodePPC64RotateMask. The values returned as +// mb and me satisfy the POWER ISA definition of MASK(x,y) where MASK(mb,me) = mask. +func DecodePPC64RotateMask(sauxint int64) (rotate, mb, me int64, mask uint64) { + auxint := uint64(sauxint) + rotate = int64((auxint >> 16) & 0xFF) + mb = int64((auxint >> 8) & 0xFF) + me = int64((auxint >> 0) & 0xFF) + nbits := int64((auxint >> 24) & 0xFF) + mask = ((1 << uint(nbits-mb)) - 1) ^ ((1 << uint(nbits-me)) - 1) + if mb > me { + mask = ^mask + } + if nbits == 32 { + mask = uint64(uint32(mask)) + } + + // Fixup ME to match ISA definition. The second argument to MASK(..,me) + // is inclusive. + me = (me - 1) & (nbits - 1) + return +} + // This verifies that the mask occupies the // rightmost bits. func isPPC64ValidShiftMask(v int64) bool { @@ -1394,6 +1459,78 @@ func getPPC64ShiftMaskLength(v int64) int64 { return int64(bits.Len64(uint64(v))) } +// Decompose a shift right into an equivalent rotate/mask, +// and return mask & m. +func mergePPC64RShiftMask(m, s, nbits int64) int64 { + smask := uint64((1<> uint(s) + return m & int64(smask) +} + +// Combine (ANDconst [m] (SRWconst [s])) into (RLWINM [y]) or return 0 +func mergePPC64AndSrwi(m, s int64) int64 { + mask := mergePPC64RShiftMask(m, s, 32) + if !isPPC64WordRotateMask(mask) { + return 0 + } + return encodePPC64RotateMask(32-s, mask, 32) +} + +// Test if a shift right feeding into a CLRLSLDI can be merged into RLWINM. +// Return the encoded RLWINM constant, or 0 if they cannot be merged. +func mergePPC64ClrlsldiSrw(sld, srw int64) int64 { + mask_1 := uint64(0xFFFFFFFF >> uint(srw)) + // for CLRLSLDI, it's more convient to think of it as a mask left bits then rotate left. + mask_2 := uint64(0xFFFFFFFFFFFFFFFF) >> uint(GetPPC64Shiftmb(int64(sld))) + + // Rewrite mask to apply after the final left shift. + mask_3 := (mask_1 & mask_2) << uint(GetPPC64Shiftsh(sld)) + + r_1 := 32 - srw + r_2 := GetPPC64Shiftsh(sld) + r_3 := (r_1 + r_2) & 31 // This can wrap. + + if uint64(uint32(mask_3)) != mask_3 || mask_3 == 0 { + return 0 + } + return encodePPC64RotateMask(int64(r_3), int64(mask_3), 32) +} + +// Test if a RLWINM feeding into a CLRLSLDI can be merged into RLWINM. Return +// the encoded RLWINM constant, or 0 if they cannot be merged. +func mergePPC64ClrlsldiRlwinm(sld int32, rlw int64) int64 { + r_1, _, _, mask_1 := DecodePPC64RotateMask(rlw) + // for CLRLSLDI, it's more convient to think of it as a mask left bits then rotate left. + mask_2 := uint64(0xFFFFFFFFFFFFFFFF) >> uint(GetPPC64Shiftmb(int64(sld))) + + // combine the masks, and adjust for the final left shift. 
+ mask_3 := (mask_1 & mask_2) << uint(GetPPC64Shiftsh(int64(sld))) + r_2 := GetPPC64Shiftsh(int64(sld)) + r_3 := (r_1 + r_2) & 31 // This can wrap. + + // Verify the result is still a valid bitmask of <= 32 bits. + if !isPPC64WordRotateMask(int64(mask_3)) || uint64(uint32(mask_3)) != mask_3 { + return 0 + } + return encodePPC64RotateMask(r_3, int64(mask_3), 32) +} + +// Compute the encoded RLWINM constant from combining (SLDconst [sld] (SRWconst [srw] x)), +// or return 0 if they cannot be combined. +func mergePPC64SldiSrw(sld, srw int64) int64 { + if sld > srw || srw >= 32 { + return 0 + } + mask_r := uint32(0xFFFFFFFF) >> uint(srw) + mask_l := uint32(0xFFFFFFFF) >> uint(sld) + mask := (mask_r & mask_l) << uint(sld) + return encodePPC64RotateMask((32-srw+sld)&31, int64(mask), 32) +} + +// Convenience function to rotate a 32 bit constant value by another constant. +func rotateLeft32(v, rotate int64) int64 { + return int64(bits.RotateLeft32(uint32(v), int(rotate))) +} + // encodes the lsb and width for arm(64) bitfield ops into the expected auxInt format. func armBFAuxInt(lsb, width int64) arm64BitField { if lsb < 0 || lsb > 63 { diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go index 84938fe27a..e5a23e8625 100644 --- a/src/cmd/compile/internal/ssa/rewritePPC64.go +++ b/src/cmd/compile/internal/ssa/rewritePPC64.go @@ -444,6 +444,8 @@ func rewriteValuePPC64(v *Value) bool { return rewriteValuePPC64_OpPPC64ANDN(v) case OpPPC64ANDconst: return rewriteValuePPC64_OpPPC64ANDconst(v) + case OpPPC64CLRLSLDI: + return rewriteValuePPC64_OpPPC64CLRLSLDI(v) case OpPPC64CMP: return rewriteValuePPC64_OpPPC64CMP(v) case OpPPC64CMPU: @@ -598,6 +600,8 @@ func rewriteValuePPC64(v *Value) bool { return rewriteValuePPC64_OpPPC64ROTL(v) case OpPPC64ROTLW: return rewriteValuePPC64_OpPPC64ROTLW(v) + case OpPPC64ROTLWconst: + return rewriteValuePPC64_OpPPC64ROTLWconst(v) case OpPPC64SLD: return rewriteValuePPC64_OpPPC64SLD(v) case OpPPC64SLDconst: @@ -614,6 +618,8 @@ func rewriteValuePPC64(v *Value) bool { return rewriteValuePPC64_OpPPC64SRD(v) case OpPPC64SRW: return rewriteValuePPC64_OpPPC64SRW(v) + case OpPPC64SRWconst: + return rewriteValuePPC64_OpPPC64SRWconst(v) case OpPPC64SUB: return rewriteValuePPC64_OpPPC64SUB(v) case OpPPC64SUBFCconst: @@ -4212,6 +4218,100 @@ func rewriteValuePPC64_OpPPC64ADDconst(v *Value) bool { func rewriteValuePPC64_OpPPC64AND(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] + // match: (AND (MOVDconst [m]) (ROTLWconst [r] x)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWINM [encodePPC64RotateMask(r,m,32)] x) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0.AuxInt) + if v_1.Op != OpPPC64ROTLWconst { + continue + } + r := auxIntToInt64(v_1.AuxInt) + x := v_1.Args[0] + if !(isPPC64WordRotateMask(m)) { + continue + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(r, m, 32)) + v.AddArg(x) + return true + } + break + } + // match: (AND (MOVDconst [m]) (ROTLW x r)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWNM [encodePPC64RotateMask(0,m,32)] x r) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0.AuxInt) + if v_1.Op != OpPPC64ROTLW { + continue + } + r := v_1.Args[1] + x := v_1.Args[0] + if !(isPPC64WordRotateMask(m)) { + continue + } + v.reset(OpPPC64RLWNM) + v.AuxInt = 
int64ToAuxInt(encodePPC64RotateMask(0, m, 32)) + v.AddArg2(x, r) + return true + } + break + } + // match: (AND (MOVDconst [m]) (SRWconst x [s])) + // cond: mergePPC64RShiftMask(m,s,32) == 0 + // result: (MOVDconst [0]) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0.AuxInt) + if v_1.Op != OpPPC64SRWconst { + continue + } + s := auxIntToInt64(v_1.AuxInt) + if !(mergePPC64RShiftMask(m, s, 32) == 0) { + continue + } + v.reset(OpPPC64MOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + break + } + // match: (AND (MOVDconst [m]) (SRWconst x [s])) + // cond: mergePPC64AndSrwi(m,s) != 0 + // result: (RLWINM [mergePPC64AndSrwi(m,s)] x) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0.AuxInt) + if v_1.Op != OpPPC64SRWconst { + continue + } + s := auxIntToInt64(v_1.AuxInt) + x := v_1.Args[0] + if !(mergePPC64AndSrwi(m, s) != 0) { + continue + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64AndSrwi(m, s)) + v.AddArg(x) + return true + } + break + } // match: (AND x (NOR y y)) // result: (ANDN x y) for { @@ -4347,6 +4447,76 @@ func rewriteValuePPC64_OpPPC64ANDN(v *Value) bool { } func rewriteValuePPC64_OpPPC64ANDconst(v *Value) bool { v_0 := v.Args[0] + // match: (ANDconst [m] (ROTLWconst [r] x)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWINM [encodePPC64RotateMask(r,m,32)] x) + for { + m := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64ROTLWconst { + break + } + r := auxIntToInt64(v_0.AuxInt) + x := v_0.Args[0] + if !(isPPC64WordRotateMask(m)) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(r, m, 32)) + v.AddArg(x) + return true + } + // match: (ANDconst [m] (ROTLW x r)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWNM [encodePPC64RotateMask(0,m,32)] x r) + for { + m := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64ROTLW { + break + } + r := v_0.Args[1] + x := v_0.Args[0] + if !(isPPC64WordRotateMask(m)) { + break + } + v.reset(OpPPC64RLWNM) + v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 32)) + v.AddArg2(x, r) + return true + } + // match: (ANDconst [m] (SRWconst x [s])) + // cond: mergePPC64RShiftMask(m,s,32) == 0 + // result: (MOVDconst [0]) + for { + m := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64SRWconst { + break + } + s := auxIntToInt64(v_0.AuxInt) + if !(mergePPC64RShiftMask(m, s, 32) == 0) { + break + } + v.reset(OpPPC64MOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + // match: (ANDconst [m] (SRWconst x [s])) + // cond: mergePPC64AndSrwi(m,s) != 0 + // result: (RLWINM [mergePPC64AndSrwi(m,s)] x) + for { + m := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64SRWconst { + break + } + s := auxIntToInt64(v_0.AuxInt) + x := v_0.Args[0] + if !(mergePPC64AndSrwi(m, s) != 0) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64AndSrwi(m, s)) + v.AddArg(x) + return true + } // match: (ANDconst [c] (ANDconst [d] x)) // result: (ANDconst [c&d] x) for { @@ -4511,6 +4681,47 @@ func rewriteValuePPC64_OpPPC64ANDconst(v *Value) bool { } return false } +func rewriteValuePPC64_OpPPC64CLRLSLDI(v *Value) bool { + v_0 := v.Args[0] + // match: (CLRLSLDI [c] (SRWconst [s] x)) + // cond: mergePPC64ClrlsldiSrw(int64(c),s) != 0 + // result: (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x) + for { + c := auxIntToInt32(v.AuxInt) + if v_0.Op != OpPPC64SRWconst { + break + } + s := auxIntToInt64(v_0.AuxInt) + x := 
v_0.Args[0] + if !(mergePPC64ClrlsldiSrw(int64(c), s) != 0) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64ClrlsldiSrw(int64(c), s)) + v.AddArg(x) + return true + } + // match: (CLRLSLDI [c] i:(RLWINM [s] x)) + // cond: mergePPC64ClrlsldiRlwinm(c,s) != 0 + // result: (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x) + for { + c := auxIntToInt32(v.AuxInt) + i := v_0 + if i.Op != OpPPC64RLWINM { + break + } + s := auxIntToInt64(i.AuxInt) + x := i.Args[0] + if !(mergePPC64ClrlsldiRlwinm(c, s) != 0) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64ClrlsldiRlwinm(c, s)) + v.AddArg(x) + return true + } + return false +} func rewriteValuePPC64_OpPPC64CMP(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -12850,6 +13061,55 @@ func rewriteValuePPC64_OpPPC64ROTLW(v *Value) bool { } return false } +func rewriteValuePPC64_OpPPC64ROTLWconst(v *Value) bool { + v_0 := v.Args[0] + // match: (ROTLWconst [r] (AND (MOVDconst [m]) x)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) + for { + r := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64AND { + break + } + _ = v_0.Args[1] + v_0_0 := v_0.Args[0] + v_0_1 := v_0.Args[1] + for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 { + if v_0_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0_0.AuxInt) + x := v_0_1 + if !(isPPC64WordRotateMask(m)) { + continue + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(r, rotateLeft32(m, r), 32)) + v.AddArg(x) + return true + } + break + } + // match: (ROTLWconst [r] (ANDconst [m] x)) + // cond: isPPC64WordRotateMask(m) + // result: (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) + for { + r := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64ANDconst { + break + } + m := auxIntToInt64(v_0.AuxInt) + x := v_0.Args[0] + if !(isPPC64WordRotateMask(m)) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(r, rotateLeft32(m, r), 32)) + v.AddArg(x) + return true + } + return false +} func rewriteValuePPC64_OpPPC64SLD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -12870,6 +13130,24 @@ func rewriteValuePPC64_OpPPC64SLD(v *Value) bool { } func rewriteValuePPC64_OpPPC64SLDconst(v *Value) bool { v_0 := v.Args[0] + // match: (SLDconst [l] (SRWconst [r] x)) + // cond: mergePPC64SldiSrw(l,r) != 0 + // result: (RLWINM [mergePPC64SldiSrw(l,r)] x) + for { + l := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64SRWconst { + break + } + r := auxIntToInt64(v_0.AuxInt) + x := v_0.Args[0] + if !(mergePPC64SldiSrw(l, r) != 0) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64SldiSrw(l, r)) + v.AddArg(x) + return true + } // match: (SLDconst [c] z:(MOVBZreg x)) // cond: c < 8 && z.Uses == 1 // result: (CLRLSLDI [newPPC64ShiftAuxInt(c,56,63,64)] x) @@ -13186,6 +13464,96 @@ func rewriteValuePPC64_OpPPC64SRW(v *Value) bool { } return false } +func rewriteValuePPC64_OpPPC64SRWconst(v *Value) bool { + v_0 := v.Args[0] + // match: (SRWconst (ANDconst [m] x) [s]) + // cond: mergePPC64RShiftMask(m>>uint(s),s,32) == 0 + // result: (MOVDconst [0]) + for { + s := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64ANDconst { + break + } + m := auxIntToInt64(v_0.AuxInt) + if !(mergePPC64RShiftMask(m>>uint(s), s, 32) == 0) { + break + } + v.reset(OpPPC64MOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + // match: (SRWconst (ANDconst [m] x) [s]) + // cond: mergePPC64AndSrwi(m>>uint(s),s) != 0 + // result: (RLWINM 
[mergePPC64AndSrwi(m>>uint(s),s)] x) + for { + s := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64ANDconst { + break + } + m := auxIntToInt64(v_0.AuxInt) + x := v_0.Args[0] + if !(mergePPC64AndSrwi(m>>uint(s), s) != 0) { + break + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64AndSrwi(m>>uint(s), s)) + v.AddArg(x) + return true + } + // match: (SRWconst (AND (MOVDconst [m]) x) [s]) + // cond: mergePPC64RShiftMask(m>>uint(s),s,32) == 0 + // result: (MOVDconst [0]) + for { + s := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64AND { + break + } + _ = v_0.Args[1] + v_0_0 := v_0.Args[0] + v_0_1 := v_0.Args[1] + for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 { + if v_0_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0_0.AuxInt) + if !(mergePPC64RShiftMask(m>>uint(s), s, 32) == 0) { + continue + } + v.reset(OpPPC64MOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + break + } + // match: (SRWconst (AND (MOVDconst [m]) x) [s]) + // cond: mergePPC64AndSrwi(m>>uint(s),s) != 0 + // result: (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x) + for { + s := auxIntToInt64(v.AuxInt) + if v_0.Op != OpPPC64AND { + break + } + _ = v_0.Args[1] + v_0_0 := v_0.Args[0] + v_0_1 := v_0.Args[1] + for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 { + if v_0_0.Op != OpPPC64MOVDconst { + continue + } + m := auxIntToInt64(v_0_0.AuxInt) + x := v_0_1 + if !(mergePPC64AndSrwi(m>>uint(s), s) != 0) { + continue + } + v.reset(OpPPC64RLWINM) + v.AuxInt = int64ToAuxInt(mergePPC64AndSrwi(m>>uint(s), s)) + v.AddArg(x) + return true + } + break + } + return false +} func rewriteValuePPC64_OpPPC64SUB(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] diff --git a/src/cmd/compile/internal/ssa/rewrite_test.go b/src/cmd/compile/internal/ssa/rewrite_test.go index 1a15d8c940..6fe429e85a 100644 --- a/src/cmd/compile/internal/ssa/rewrite_test.go +++ b/src/cmd/compile/internal/ssa/rewrite_test.go @@ -36,3 +36,184 @@ func TestSubFlags(t *testing.T) { t.Errorf("subFlags32(0,1).ult() returned false") } } + +func TestIsPPC64WordRotateMask(t *testing.T) { + tests := []struct { + input int64 + expected bool + }{ + {0x00000001, true}, + {0x80000001, true}, + {0x80010001, false}, + {0xFFFFFFFA, false}, + {0xF0F0F0F0, false}, + {0xFFFFFFFD, true}, + {0x80000000, true}, + {0x00000000, false}, + {0xFFFFFFFF, true}, + {0x0000FFFF, true}, + {0xFF0000FF, true}, + {0x00FFFF00, true}, + } + + for _, v := range tests { + if v.expected != isPPC64WordRotateMask(v.input) { + t.Errorf("isPPC64WordRotateMask(0x%x) failed", v.input) + } + } +} + +func TestEncodeDecodePPC64WordRotateMask(t *testing.T) { + tests := []struct { + rotate int64 + mask uint64 + nbits, + mb, + me, + encoded int64 + }{ + {1, 0x00000001, 32, 31, 31, 0x20011f20}, + {2, 0x80000001, 32, 31, 0, 0x20021f01}, + {3, 0xFFFFFFFD, 32, 31, 29, 0x20031f1e}, + {4, 0x80000000, 32, 0, 0, 0x20040001}, + {5, 0xFFFFFFFF, 32, 0, 31, 0x20050020}, + {6, 0x0000FFFF, 32, 16, 31, 0x20061020}, + {7, 0xFF0000FF, 32, 24, 7, 0x20071808}, + {8, 0x00FFFF00, 32, 8, 23, 0x20080818}, + + {9, 0x0000000000FFFF00, 64, 40, 55, 0x40092838}, + {10, 0xFFFF000000000000, 64, 0, 15, 0x400A0010}, + {10, 0xFFFF000000000001, 64, 63, 15, 0x400A3f10}, + } + + for i, v := range tests { + result := encodePPC64RotateMask(v.rotate, int64(v.mask), v.nbits) + if result != v.encoded { + t.Errorf("encodePPC64RotateMask(%d,0x%x,%d) = 0x%x, expected 0x%x", v.rotate, v.mask, v.nbits, result, v.encoded) + } + rotate, mb, me, mask := DecodePPC64RotateMask(result) + if rotate != 
v.rotate || mb != v.mb || me != v.me || mask != v.mask { + t.Errorf("DecodePPC64Failure(Test %d) got (%d, %d, %d, %x) expected (%d, %d, %d, %x)", i, rotate, mb, me, mask, v.rotate, v.mb, v.me, v.mask) + } + } +} + +func TestMergePPC64ClrlsldiSrw(t *testing.T) { + tests := []struct { + clrlsldi int32 + srw int64 + valid bool + rotate int64 + mask uint64 + }{ + // ((x>>4)&0xFF)<<4 + {newPPC64ShiftAuxInt(4, 56, 63, 64), 4, true, 0, 0xFF0}, + // ((x>>4)&0xFFFF)<<4 + {newPPC64ShiftAuxInt(4, 48, 63, 64), 4, true, 0, 0xFFFF0}, + // ((x>>4)&0xFFFF)<<17 + {newPPC64ShiftAuxInt(17, 48, 63, 64), 4, false, 0, 0}, + // ((x>>4)&0xFFFF)<<16 + {newPPC64ShiftAuxInt(16, 48, 63, 64), 4, true, 12, 0xFFFF0000}, + // ((x>>32)&0xFFFF)<<17 + {newPPC64ShiftAuxInt(17, 48, 63, 64), 32, false, 0, 0}, + } + for i, v := range tests { + result := mergePPC64ClrlsldiSrw(int64(v.clrlsldi), v.srw) + if v.valid && result == 0 { + t.Errorf("mergePPC64ClrlsldiSrw(Test %d) did not merge", i) + } else if !v.valid && result != 0 { + t.Errorf("mergePPC64ClrlsldiSrw(Test %d) should return 0", i) + } else if r, _, _, m := DecodePPC64RotateMask(result); v.rotate != r || v.mask != m { + t.Errorf("mergePPC64ClrlsldiSrw(Test %d) got (%d,0x%x) expected (%d,0x%x)", i, r, m, v.rotate, v.mask) + } + } +} + +func TestMergePPC64ClrlsldiRlwinm(t *testing.T) { + tests := []struct { + clrlsldi int32 + rlwinm int64 + valid bool + rotate int64 + mask uint64 + }{ + // ((x<<4)&0xFF00)<<4 + {newPPC64ShiftAuxInt(4, 56, 63, 64), encodePPC64RotateMask(4, 0xFF00, 32), false, 0, 0}, + // ((x>>4)&0xFF)<<4 + {newPPC64ShiftAuxInt(4, 56, 63, 64), encodePPC64RotateMask(28, 0x0FFFFFFF, 32), true, 0, 0xFF0}, + // ((x>>4)&0xFFFF)<<4 + {newPPC64ShiftAuxInt(4, 48, 63, 64), encodePPC64RotateMask(28, 0xFFFF, 32), true, 0, 0xFFFF0}, + // ((x>>4)&0xFFFF)<<17 + {newPPC64ShiftAuxInt(17, 48, 63, 64), encodePPC64RotateMask(28, 0xFFFF, 32), false, 0, 0}, + // ((x>>4)&0xFFFF)<<16 + {newPPC64ShiftAuxInt(16, 48, 63, 64), encodePPC64RotateMask(28, 0xFFFF, 32), true, 12, 0xFFFF0000}, + // ((x>>4)&0xF000FFFF)<<16 + {newPPC64ShiftAuxInt(16, 48, 63, 64), encodePPC64RotateMask(28, 0xF000FFFF, 32), true, 12, 0xFFFF0000}, + } + for i, v := range tests { + result := mergePPC64ClrlsldiRlwinm(v.clrlsldi, v.rlwinm) + if v.valid && result == 0 { + t.Errorf("mergePPC64ClrlsldiRlwinm(Test %d) did not merge", i) + } else if !v.valid && result != 0 { + t.Errorf("mergePPC64ClrlsldiRlwinm(Test %d) should return 0", i) + } else if r, _, _, m := DecodePPC64RotateMask(result); v.rotate != r || v.mask != m { + t.Errorf("mergePPC64ClrlsldiRlwinm(Test %d) got (%d,0x%x) expected (%d,0x%x)", i, r, m, v.rotate, v.mask) + } + } +} + +func TestMergePPC64SldiSrw(t *testing.T) { + tests := []struct { + sld int64 + srw int64 + valid bool + rotate int64 + mask uint64 + }{ + {4, 4, true, 0, 0xFFFFFFF0}, + {4, 8, true, 28, 0x0FFFFFF0}, + {0, 0, true, 0, 0xFFFFFFFF}, + {8, 4, false, 0, 0}, + {0, 32, false, 0, 0}, + {0, 31, true, 1, 0x1}, + {31, 31, true, 0, 0x80000000}, + {32, 32, false, 0, 0}, + } + for i, v := range tests { + result := mergePPC64SldiSrw(v.sld, v.srw) + if v.valid && result == 0 { + t.Errorf("mergePPC64SldiSrw(Test %d) did not merge", i) + } else if !v.valid && result != 0 { + t.Errorf("mergePPC64SldiSrw(Test %d) should return 0", i) + } else if r, _, _, m := DecodePPC64RotateMask(result); v.rotate != r || v.mask != m { + t.Errorf("mergePPC64SldiSrw(Test %d) got (%d,0x%x) expected (%d,0x%x)", i, r, m, v.rotate, v.mask) + } + } +} + +func TestMergePPC64AndSrwi(t *testing.T) { + tests := []struct { 
+ and int64 + srw int64 + valid bool + rotate int64 + mask uint64 + }{ + {0x000000FF, 8, true, 24, 0xFF}, + {0xF00000FF, 8, true, 24, 0xFF}, + {0x0F0000FF, 4, false, 0, 0}, + {0x00000000, 4, false, 0, 0}, + {0xF0000000, 4, false, 0, 0}, + {0xF0000000, 32, false, 0, 0}, + } + for i, v := range tests { + result := mergePPC64AndSrwi(v.and, v.srw) + if v.valid && result == 0 { + t.Errorf("mergePPC64AndSrwi(Test %d) did not merge", i) + } else if !v.valid && result != 0 { + t.Errorf("mergePPC64AndSrwi(Test %d) should return 0", i) + } else if r, _, _, m := DecodePPC64RotateMask(result); v.rotate != r || v.mask != m { + t.Errorf("mergePPC64AndSrwi(Test %d) got (%d,0x%x) expected (%d,0x%x)", i, r, m, v.rotate, v.mask) + } + } +} diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go index ce24b57877..0c8b030970 100644 --- a/test/codegen/rotate.go +++ b/test/codegen/rotate.go @@ -6,6 +6,8 @@ package codegen +import "math/bits" + // ------------------- // // const rotates // // ------------------- // @@ -166,3 +168,46 @@ func f32(x uint32) uint32 { // amd64:"ROLL\t[$]7" return rot32nc(x, 7) } + +// --------------------------------------- // +// Combined Rotate + Masking operations // +// --------------------------------------- // + +func checkMaskedRotate32(a []uint32, r int) { + i := 0 + + // ppc64le: "RLWNM\t[$]16, R[0-9]+, [$]16711680, R[0-9]+" + // ppc64: "RLWNM\t[$]16, R[0-9]+, [$]16711680, R[0-9]+" + a[i] = bits.RotateLeft32(a[i], 16) & 0xFF0000 + i++ + // ppc64le: "RLWNM\t[$]16, R[0-9]+, [$]16711680, R[0-9]+" + // ppc64: "RLWNM\t[$]16, R[0-9]+, [$]16711680, R[0-9]+" + a[i] = bits.RotateLeft32(a[i]&0xFF, 16) + i++ + // ppc64le: "RLWNM\t[$]4, R[0-9]+, [$]4080, R[0-9]+" + // ppc64: "RLWNM\t[$]4, R[0-9]+, [$]4080, R[0-9]+" + a[i] = bits.RotateLeft32(a[i], 4) & 0xFF0 + i++ + // ppc64le: "RLWNM\t[$]16, R[0-9]+, [$]255, R[0-9]+" + // ppc64: "RLWNM\t[$]16, R[0-9]+, [$]255, R[0-9]+" + a[i] = bits.RotateLeft32(a[i]&0xFF0000, 16) + i++ + + // ppc64le: "RLWNM\tR[0-9]+, R[0-9]+, [$]16711680, R[0-9]+" + // ppc64: "RLWNM\tR[0-9]+, R[0-9]+, [$]16711680, R[0-9]+" + a[i] = bits.RotateLeft32(a[i], r) & 0xFF0000 + i++ + // ppc64le: "RLWNM\tR[0-9]+, R[0-9]+, [$]65280, R[0-9]+" + // ppc64: "RLWNM\tR[0-9]+, R[0-9]+, [$]65280, R[0-9]+" + a[i] = bits.RotateLeft32(a[3], r) & 0xFF00 + i++ + + // ppc64le: "RLWNM\tR[0-9]+, R[0-9]+, [$]4293922815, R[0-9]+" + // ppc64: "RLWNM\tR[0-9]+, R[0-9]+, [$]4293922815, R[0-9]+" + a[i] = bits.RotateLeft32(a[3], r) & 0xFFF00FFF + i++ + // ppc64le: "RLWNM\t[$]4, R[0-9]+, [$]4293922815, R[0-9]+" + // ppc64: "RLWNM\t[$]4, R[0-9]+, [$]4293922815, R[0-9]+" + a[i] = bits.RotateLeft32(a[3], 4) & 0xFFF00FFF + i++ +} diff --git a/test/codegen/shift.go b/test/codegen/shift.go index bbfc85ffbb..a45f27c9cf 100644 --- a/test/codegen/shift.go +++ b/test/codegen/shift.go @@ -156,29 +156,29 @@ func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byt // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f := tab[byte(v)^b] // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" - // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f += tab[byte(v)&b] // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" - // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f += tab[byte(v)|b] // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" - // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f += tab[uint16(v)&h] // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" - // 
ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f += tab[uint16(v)^h] // ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" - // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" + // ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI" f += tab[uint16(v)|h] // ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI" // ppc64:-".*AND",-"RLDICR",".*CLRLSLDI" f += tab[v&0xff] // ppc64le:-".*AND",".*CLRLSLWI" - // ppc64:-".*AND",".*CLRLSLWI" - f += 2*uint32(uint16(d)) + // ppc64:-".*AND",".*CLRLSLWI" + f += 2 * uint32(uint16(d)) // ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI" // ppc64:-".*AND",-"RLDICR",".*CLRLSLDI" - g := 2*uint64(uint32(d)) + g := 2 * uint64(uint32(d)) return f, g } @@ -186,10 +186,10 @@ func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64 // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" - f := (v8 &0xF) << 2 + f := (v8 & 0xF) << 2 // ppc64le:"CLRLSLWI" // ppc64:"CLRLSLWI" - f += byte(v16)<<3 + f += byte(v16) << 3 // ppc64le:-"AND","CLRLSLWI" // ppc64:-"AND","CLRLSLWI" g := (v16 & 0xFF) << 3 @@ -207,29 +207,81 @@ func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64 i += (v64 & 0xFFFF00) << 10 // ppc64le/power9:-"SLD","EXTSWSLI" // ppc64/power9:-"SLD","EXTSWSLI" - j := int64(x32+32)*8 + j := int64(x32+32) * 8 return f, g, h, i, j } func checkWidenAfterShift(v int64, u uint64) (int64, uint64) { // ppc64le:-".*MOVW" - f := int32(v>>32) + f := int32(v >> 32) // ppc64le:".*MOVW" - f += int32(v>>31) + f += int32(v >> 31) // ppc64le:-".*MOVH" - g := int16(v>>48) + g := int16(v >> 48) // ppc64le:".*MOVH" - g += int16(v>>30) + g += int16(v >> 30) // ppc64le:-".*MOVH" - g += int16(f>>16) + g += int16(f >> 16) // ppc64le:-".*MOVB" - h := int8(v>>56) + h := int8(v >> 56) // ppc64le:".*MOVB" - h += int8(v>>28) + h += int8(v >> 28) // ppc64le:-".*MOVB" - h += int8(f>>24) + h += int8(f >> 24) // ppc64le:".*MOVB" - h += int8(f>>16) - return int64(h),uint64(g) + h += int8(f >> 16) + return int64(h), uint64(g) +} + +func checkShiftAndMask32(v []uint32) { + i := 0 + + // ppc64le: "RLWNM\t[$]24, R[0-9]+, [$]1044480, R[0-9]+" + // ppc64: "RLWNM\t[$]24, R[0-9]+, [$]1044480, R[0-9]+" + v[i] = (v[i] & 0xFF00000) >> 8 + i++ + // ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]1020, R[0-9]+" + // ppc64: "RLWNM\t[$]26, R[0-9]+, [$]1020, R[0-9]+" + v[i] = (v[i] & 0xFF00) >> 6 + i++ + // ppc64le: "MOVW\tR0" + // ppc64: "MOVW\tR0" + v[i] = (v[i] & 0xFF) >> 8 + i++ + // ppc64le: "MOVW\tR0" + // ppc64: "MOVW\tR0" + v[i] = (v[i] & 0xF000000) >> 28 + i++ + // ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]255, R[0-9]+" + // ppc64: "RLWNM\t[$]26, R[0-9]+, [$]255, R[0-9]+" + v[i] = (v[i] >> 6) & 0xFF + i++ + // ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]1044480, R[0-9]+" + // ppc64: "RLWNM\t[$]26, R[0-9]+, [$]1044480, R[0-9]+" + v[i] = (v[i] >> 6) & 0xFF000 + i++ + // ppc64le: "MOVW\tR0" + // ppc64: "MOVW\tR0" + v[i] = (v[i] >> 20) & 0xFF000 + i++ + // ppc64le: "MOVW\tR0" + // ppc64: "MOVW\tR0" + v[i] = (v[i] >> 24) & 0xFF00 + i++ +} + +func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) { + //ppc64le: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]1020, R[0-9]+" + //ppc64: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]1020, R[0-9]+" + a[0] = a[uint8(v>>24)] + //ppc64le: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]2040, R[0-9]+" + //ppc64: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]2040, R[0-9]+" + b[0] = b[uint8(v>>24)] + //ppc64le: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]2040, R[0-9]+" + //ppc64: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]2040, R[0-9]+" + b[1] = 
b[(v>>20)&0xFF] + //ppc64le: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]1016, R[0-9]+" + //ppc64: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]1016, R[0-9]+" + b[2] = b[v>>25] } -- cgit v1.3 From 0387bedadf8ec0ec4139af7c1361abaa47a6c03a Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Sun, 1 Nov 2020 17:05:32 -0500 Subject: cmd/compile: remove racefuncenterfp when it is not needed We already remove racefuncenter and racefuncexit if they are not needed (i.e. the function doesn't have any other race calls). racefuncenterfp is like racefuncenter but used on LR machines. Remove unnecessary racefuncenterfp as well. Change-Id: I65edb00e19c6d9ab55a204cbbb93e9fb710559f1 Reviewed-on: https://go-review.googlesource.com/c/go/+/267099 Trust: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Go Bot Reviewed-by: David Chase --- src/cmd/compile/internal/ssa/rewrite.go | 6 +++--- test/codegen/race.go | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 974c5ac8c3..39aa63d947 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -1573,18 +1573,18 @@ func needRaceCleanup(sym *AuxCall, v *Value) bool { if !f.Config.Race { return false } - if !isSameCall(sym, "runtime.racefuncenter") && !isSameCall(sym, "runtime.racefuncexit") { + if !isSameCall(sym, "runtime.racefuncenter") && !isSameCall(sym, "runtime.racefuncenterfp") && !isSameCall(sym, "runtime.racefuncexit") { return false } for _, b := range f.Blocks { for _, v := range b.Values { switch v.Op { case OpStaticCall: - // Check for racefuncenter will encounter racefuncexit and vice versa. + // Check for racefuncenter/racefuncenterfp will encounter racefuncexit and vice versa. // Allow calls to panic* s := v.Aux.(*AuxCall).Fn.String() switch s { - case "runtime.racefuncenter", "runtime.racefuncexit", + case "runtime.racefuncenter", "runtime.racefuncenterfp", "runtime.racefuncexit", "runtime.panicdivide", "runtime.panicwrap", "runtime.panicshift": continue diff --git a/test/codegen/race.go b/test/codegen/race.go index ed6706f880..b977823906 100644 --- a/test/codegen/race.go +++ b/test/codegen/race.go @@ -10,6 +10,8 @@ package codegen // functions with no calls (but which might panic // in various ways). See issue 31219. // amd64:-"CALL.*racefuncenter.*" +// arm64:-"CALL.*racefuncenter.*" +// ppc64le:-"CALL.*racefuncenter.*" func RaceMightPanic(a []int, i, j, k, s int) { var b [4]int _ = b[i] // panicIndex -- cgit v1.3 From 854e892ce17e2555c59fce5b92f64bc505ba5d8c Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Mon, 11 May 2020 09:44:48 -0700 Subject: cmd/compile: optimize shift pairs and masks on s390x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. 
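As a rough illustration (this snippet is not taken from the CL itself),
the kind of shift pair these rules target looks like:

    // Isolate bits 32..47 of x. The new rules should allow this to be
    // lowered to a single RISBGZ rather than an SLD followed by an SRD.
    func field(x uint64) uint64 {
            return (x << 16) >> 48
    }

Other rules also key off such shift and zero-extension sequences, so
rewriting them too aggressively can hide the patterns those rules
expect.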
This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday TryBot-Result: Go Bot Reviewed-by: Keith Randall Trust: Michael Munday --- src/cmd/compile/internal/s390x/ssa.go | 14 +- src/cmd/compile/internal/ssa/gen/S390X.rules | 162 ++++- src/cmd/compile/internal/ssa/gen/S390XOps.go | 20 +- src/cmd/compile/internal/ssa/opGen.go | 25 +- src/cmd/compile/internal/ssa/rewriteS390X.go | 844 +++++++++++++++++++-------- src/cmd/internal/obj/s390x/rotate.go | 82 ++- src/cmd/internal/obj/s390x/rotate_test.go | 122 ++++ test/codegen/bitfield.go | 18 +- test/codegen/bits.go | 12 + test/codegen/mathbits.go | 2 +- test/codegen/rotate.go | 6 +- test/codegen/shift.go | 33 +- 12 files changed, 1007 insertions(+), 333 deletions(-) create mode 100644 src/cmd/internal/obj/s390x/rotate_test.go (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go index 84b9f491e4..8037357131 100644 --- a/src/cmd/compile/internal/s390x/ssa.go +++ b/src/cmd/compile/internal/s390x/ssa.go @@ -188,6 +188,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { {Type: obj.TYPE_REG, Reg: r2}, }) p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1} + case ssa.OpS390XRISBGZ: + r1 := v.Reg() + r2 := v.Args[0].Reg() + i := v.Aux.(s390x.RotateParams) + p := s.Prog(v.Op.Asm()) + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(i.Start)} + p.SetRestArgs([]obj.Addr{ + {Type: obj.TYPE_CONST, Offset: int64(i.End)}, + {Type: obj.TYPE_CONST, Offset: int64(i.Amount)}, + {Type: obj.TYPE_REG, Reg: r2}, + }) + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1} case ssa.OpS390XADD, ssa.OpS390XADDW, ssa.OpS390XSUB, ssa.OpS390XSUBW, ssa.OpS390XAND, ssa.OpS390XANDW, @@ -360,7 +372,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { case ssa.OpS390XSLDconst, ssa.OpS390XSLWconst, ssa.OpS390XSRDconst, ssa.OpS390XSRWconst, ssa.OpS390XSRADconst, ssa.OpS390XSRAWconst, - ssa.OpS390XRLLGconst, ssa.OpS390XRLLconst: + ssa.OpS390XRLLconst: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules index 1b56361c00..39949edbc2 100644 --- a/src/cmd/compile/internal/ssa/gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/gen/S390X.rules @@ -643,8 +643,18 @@ // equivalent to the leftmost 32 bits being set. 
// TODO(mundaym): modify the assembler to accept 64-bit values // and use isU32Bit(^c). -(AND x (MOVDconst [c])) && is32Bit(c) && c < 0 => (ANDconst [c] x) -(AND x (MOVDconst [c])) && is32Bit(c) && c >= 0 => (MOVWZreg (ANDWconst [int32(c)] x)) +(AND x (MOVDconst [c])) + && s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil + => (RISBGZ x {*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))}) +(AND x (MOVDconst [c])) + && is32Bit(c) + && c < 0 + => (ANDconst [c] x) +(AND x (MOVDconst [c])) + && is32Bit(c) + && c >= 0 + => (MOVWZreg (ANDWconst [int32(c)] x)) + (ANDW x (MOVDconst [c])) => (ANDWconst [int32(c)] x) ((AND|ANDW)const [c] ((AND|ANDW)const [d] x)) => ((AND|ANDW)const [c&d] x) @@ -653,14 +663,20 @@ ((OR|XOR)W x (MOVDconst [c])) => ((OR|XOR)Wconst [int32(c)] x) // Constant shifts. -(S(LD|RD|RAD|LW|RW|RAW) x (MOVDconst [c])) - => (S(LD|RD|RAD|LW|RW|RAW)const x [int8(c&63)]) +(S(LD|RD|RAD) x (MOVDconst [c])) => (S(LD|RD|RAD)const x [int8(c&63)]) +(S(LW|RW|RAW) x (MOVDconst [c])) && c&32 == 0 => (S(LW|RW|RAW)const x [int8(c&31)]) +(S(LW|RW) _ (MOVDconst [c])) && c&32 != 0 => (MOVDconst [0]) +(SRAW x (MOVDconst [c])) && c&32 != 0 => (SRAWconst x [31]) // Shifts only use the rightmost 6 bits of the shift value. +(S(LD|RD|RAD|LW|RW|RAW) x (RISBGZ y {r})) + && r.Amount == 0 + && r.OutMask()&63 == 63 + => (S(LD|RD|RAD|LW|RW|RAW) x y) (S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y)) - => (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [int32(c&63)] y)) + => (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [int32(c&63)] y)) (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63 - => (S(LD|RD|RAD|LW|RW|RAW) x y) + => (S(LD|RD|RAD|LW|RW|RAW) x y) (SLD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLD x y) (SRD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRD x y) (SRAD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAD x y) @@ -668,17 +684,13 @@ (SRW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRW x y) (SRAW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAW x y) -// Constant rotate generation -(RLL x (MOVDconst [c])) => (RLLconst x [int8(c&31)]) -(RLLG x (MOVDconst [c])) => (RLLGconst x [int8(c&63)]) - -(ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (RLLGconst [c] x) -( OR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (RLLGconst [c] x) -(XOR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (RLLGconst [c] x) +// Match rotate by constant. +(RLLG x (MOVDconst [c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, int8(c&63))}) +(RLL x (MOVDconst [c])) => (RLLconst x [int8(c&31)]) -(ADDW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (RLLconst [c] x) -( ORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (RLLconst [c] x) -(XORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (RLLconst [c] x) +// Match rotate by constant pattern. +((ADD|OR|XOR) (SLDconst x [c]) (SRDconst x [64-c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, c)}) +((ADD|OR|XOR)W (SLWconst x [c]) (SRWconst x [32-c])) => (RLLconst x [c]) // Signed 64-bit comparison with immediate. (CMP x (MOVDconst [c])) && is32Bit(c) => (CMPconst x [int32(c)]) @@ -692,15 +704,97 @@ (CMP(W|WU) x (MOVDconst [c])) => (CMP(W|WU)const x [int32(c)]) (CMP(W|WU) (MOVDconst [c]) x) => (InvertFlags (CMP(W|WU)const x [int32(c)])) +// Match (x >> c) << d to 'rotate then insert selected bits [into zero]'. +(SLDconst (SRDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(max8(0, c-d), 63-d, (d-c)&63)}) + +// Match (x << c) >> d to 'rotate then insert selected bits [into zero]'. 
+(SRDconst (SLDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(d, min8(63, 63-c+d), (c-d)&63)}) + +// Absorb input zero extension into 'rotate then insert selected bits [into zero]'. +(RISBGZ (MOVWZreg x) {r}) && r.InMerge(0xffffffff) != nil => (RISBGZ x {*r.InMerge(0xffffffff)}) +(RISBGZ (MOVHZreg x) {r}) && r.InMerge(0x0000ffff) != nil => (RISBGZ x {*r.InMerge(0x0000ffff)}) +(RISBGZ (MOVBZreg x) {r}) && r.InMerge(0x000000ff) != nil => (RISBGZ x {*r.InMerge(0x000000ff)}) + +// Absorb 'rotate then insert selected bits [into zero]' into zero extension. +(MOVWZreg (RISBGZ x {r})) && r.OutMerge(0xffffffff) != nil => (RISBGZ x {*r.OutMerge(0xffffffff)}) +(MOVHZreg (RISBGZ x {r})) && r.OutMerge(0x0000ffff) != nil => (RISBGZ x {*r.OutMerge(0x0000ffff)}) +(MOVBZreg (RISBGZ x {r})) && r.OutMerge(0x000000ff) != nil => (RISBGZ x {*r.OutMerge(0x000000ff)}) + +// Absorb shift into 'rotate then insert selected bits [into zero]'. +// +// Any unsigned shift can be represented as a rotate and mask operation: +// +// x << c => RotateLeft64(x, c) & (^uint64(0) << c) +// x >> c => RotateLeft64(x, -c) & (^uint64(0) >> c) +// +// Therefore when a shift is used as the input to a rotate then insert +// selected bits instruction we can merge the two together. We just have +// to be careful that the resultant mask is representable (non-zero and +// contiguous). For example, assuming that x is variable and c, y and m +// are constants, a shift followed by a rotate then insert selected bits +// could be represented as: +// +// RotateLeft64(RotateLeft64(x, c) & (^uint64(0) << c), y) & m +// +// We can split the rotation by y into two, one rotate for x and one for +// the mask: +// +// RotateLeft64(RotateLeft64(x, c), y) & (RotateLeft64(^uint64(0) << c, y)) & m +// +// The rotations of x by c followed by y can then be combined: +// +// RotateLeft64(x, c+y) & (RotateLeft64(^uint64(0) << c, y)) & m +// ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +// rotate mask +// +// To perform this optimization we therefore just need to check that it +// is valid to merge the shift mask (^(uint64(0)< (RISBGZ x {(*r.InMerge(^uint64(0)<>c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)>>c)).RotateLeft(-c)}) + +// Absorb 'rotate then insert selected bits [into zero]' into left shift. +(SLDconst (RISBGZ x {r}) [c]) + && s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil + => (RISBGZ x {(*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + +// Absorb 'rotate then insert selected bits [into zero]' into right shift. +(SRDconst (RISBGZ x {r}) [c]) + && s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil + => (RISBGZ x {(*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + +// Merge 'rotate then insert selected bits [into zero]' instructions together. +(RISBGZ (RISBGZ x {y}) {z}) + && z.InMerge(y.OutMask()) != nil + => (RISBGZ x {(*z.InMerge(y.OutMask())).RotateLeft(y.Amount)}) + +// Convert RISBGZ into 64-bit shift (helps CSE). +(RISBGZ x {r}) && r.End == 63 && r.Start == -r.Amount&63 => (SRDconst x [-r.Amount&63]) +(RISBGZ x {r}) && r.Start == 0 && r.End == 63-r.Amount => (SLDconst x [r.Amount]) + +// Optimize single bit isolation when it is known to be equivalent to +// the most significant bit due to mask produced by arithmetic shift. +// Simply isolate the most significant bit itself and place it in the +// correct position. 
+// +// Example: (int64(x) >> 63) & 0x8 -> RISBGZ $60, $60, $4, Rsrc, Rdst +(RISBGZ (SRADconst x [c]) {r}) + && r.Start == r.End // single bit selected + && (r.Start+r.Amount)&63 <= c // equivalent to most significant bit of x + => (RISBGZ x {s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)}) + // Canonicalize the order of arguments to comparisons - helps with CSE. ((CMP|CMPW|CMPU|CMPWU) x y) && x.ID > y.ID => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x)) -// Using MOV{W,H,B}Zreg instead of AND is cheaper. -(AND x (MOVDconst [0xFF])) => (MOVBZreg x) -(AND x (MOVDconst [0xFFFF])) => (MOVHZreg x) -(AND x (MOVDconst [0xFFFFFFFF])) => (MOVWZreg x) -(ANDWconst [0xFF] x) => (MOVBZreg x) -(ANDWconst [0xFFFF] x) => (MOVHZreg x) +// Use sign/zero extend instead of RISBGZ. +(RISBGZ x {r}) && r == s390x.NewRotateParams(56, 63, 0) => (MOVBZreg x) +(RISBGZ x {r}) && r == s390x.NewRotateParams(48, 63, 0) => (MOVHZreg x) +(RISBGZ x {r}) && r == s390x.NewRotateParams(32, 63, 0) => (MOVWZreg x) + +// Use sign/zero extend instead of ANDW. +(ANDWconst [0x00ff] x) => (MOVBZreg x) +(ANDWconst [0xffff] x) => (MOVHZreg x) // Strength reduce multiplication to the sum (or difference) of two powers of two. // @@ -773,21 +867,22 @@ // detect attempts to set/clear the sign bit // may need to be reworked when NIHH/OIHH are added -(SRDconst [1] (SLDconst [1] (LGDR x))) => (LGDR (LPDFR x)) -(LDGR (SRDconst [1] (SLDconst [1] x))) => (LPDFR (LDGR x)) -(AND (MOVDconst [^(-1<<63)]) (LGDR x)) => (LGDR (LPDFR x)) -(LDGR (AND (MOVDconst [^(-1<<63)]) x)) => (LPDFR (LDGR x)) -(OR (MOVDconst [-1<<63]) (LGDR x)) => (LGDR (LNDFR x)) -(LDGR (OR (MOVDconst [-1<<63]) x)) => (LNDFR (LDGR x)) +(RISBGZ (LGDR x) {r}) && r == s390x.NewRotateParams(1, 63, 0) => (LGDR (LPDFR x)) +(LDGR (RISBGZ x {r})) && r == s390x.NewRotateParams(1, 63, 0) => (LPDFR (LDGR x)) +(OR (MOVDconst [-1<<63]) (LGDR x)) => (LGDR (LNDFR x)) +(LDGR (OR (MOVDconst [-1<<63]) x)) => (LNDFR (LDGR x)) // detect attempts to set the sign bit with load (LDGR x:(ORload [off] {sym} (MOVDconst [-1<<63]) ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (LNDFR (LDGR (MOVDload [off] {sym} ptr mem))) // detect copysign -(OR (SLDconst [63] (SRDconst [63] (LGDR x))) (LGDR (LPDFR y))) => (LGDR (CPSDR y x)) -(OR (SLDconst [63] (SRDconst [63] (LGDR x))) (MOVDconst [c])) && c & -1<<63 == 0 => (LGDR (CPSDR (FMOVDconst [math.Float64frombits(uint64(c))]) x)) -(OR (AND (MOVDconst [-1<<63]) (LGDR x)) (LGDR (LPDFR y))) => (LGDR (CPSDR y x)) -(OR (AND (MOVDconst [-1<<63]) (LGDR x)) (MOVDconst [c])) && c & -1<<63 == 0 => (LGDR (CPSDR (FMOVDconst [math.Float64frombits(uint64(c))]) x)) +(OR (RISBGZ (LGDR x) {r}) (LGDR (LPDFR y))) + && r == s390x.NewRotateParams(0, 0, 0) + => (LGDR (CPSDR y x)) +(OR (RISBGZ (LGDR x) {r}) (MOVDconst [c])) + && c >= 0 + && r == s390x.NewRotateParams(0, 0, 0) + => (LGDR (CPSDR (FMOVDconst [math.Float64frombits(uint64(c))]) x)) (CPSDR y (FMOVDconst [c])) && !math.Signbit(c) => (LPDFR y) (CPSDR y (FMOVDconst [c])) && math.Signbit(c) => (LNDFR y) @@ -966,6 +1061,9 @@ (CMPWconst (ANDWconst _ [m]) [n]) && int32(m) >= 0 && int32(m) < int32(n) => (FlagLT) (CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) => (FlagLT) +(CMPconst (RISBGZ x {r}) [c]) && c > 0 && r.OutMask() < uint64(c) => (FlagLT) +(CMPUconst (RISBGZ x {r}) [c]) && r.OutMask() < uint64(uint32(c)) => (FlagLT) + // Constant compare-and-branch with immediate. 
(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int64(x) == int64(y) => (First yes no) (CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int64(x) < int64(y) => (First yes no) diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go index 728cfb5508..f0cf2f2f6e 100644 --- a/src/cmd/compile/internal/ssa/gen/S390XOps.go +++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go @@ -331,25 +331,26 @@ func init() { {name: "LTEBR", argLength: 1, reg: fp1flags, asm: "LTEBR", typ: "Flags"}, // arg0 compare to 0, f32 {name: "SLD", argLength: 2, reg: sh21, asm: "SLD"}, // arg0 << arg1, shift amount is mod 64 - {name: "SLW", argLength: 2, reg: sh21, asm: "SLW"}, // arg0 << arg1, shift amount is mod 32 + {name: "SLW", argLength: 2, reg: sh21, asm: "SLW"}, // arg0 << arg1, shift amount is mod 64 {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int8"}, // arg0 << auxint, shift amount 0-63 {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int8"}, // arg0 << auxint, shift amount 0-31 {name: "SRD", argLength: 2, reg: sh21, asm: "SRD"}, // unsigned arg0 >> arg1, shift amount is mod 64 - {name: "SRW", argLength: 2, reg: sh21, asm: "SRW"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 32 + {name: "SRW", argLength: 2, reg: sh21, asm: "SRW"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 64 {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int8"}, // unsigned arg0 >> auxint, shift amount 0-63 {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int8"}, // unsigned uint32(arg0) >> auxint, shift amount 0-31 // Arithmetic shifts clobber flags. {name: "SRAD", argLength: 2, reg: sh21, asm: "SRAD", clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 64 - {name: "SRAW", argLength: 2, reg: sh21, asm: "SRAW", clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 32 + {name: "SRAW", argLength: 2, reg: sh21, asm: "SRAW", clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 64 {name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "Int8", clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63 {name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "Int8", clobberFlags: true}, // signed int32(arg0) >> auxint, shift amount 0-31 - {name: "RLLG", argLength: 2, reg: sh21, asm: "RLLG"}, // arg0 rotate left arg1, rotate amount 0-63 - {name: "RLL", argLength: 2, reg: sh21, asm: "RLL"}, // arg0 rotate left arg1, rotate amount 0-31 - {name: "RLLGconst", argLength: 1, reg: gp11, asm: "RLLG", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-63 - {name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-31 + // Rotate instructions. + // Note: no RLLGconst - use RISBGZ instead. + {name: "RLLG", argLength: 2, reg: sh21, asm: "RLLG"}, // arg0 rotate left arg1, rotate amount 0-63 + {name: "RLL", argLength: 2, reg: sh21, asm: "RLL"}, // arg0 rotate left arg1, rotate amount 0-31 + {name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-31 // Rotate then (and|or|xor|insert) selected bits instructions. 
// @@ -371,6 +372,7 @@ func init() { // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+ // {name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "S390XRotateParams", clobberFlags: true}, // rotate then xor selected bits + {name: "RISBGZ", argLength: 1, reg: gp11, asm: "RISBGZ", aux: "S390XRotateParams", clobberFlags: true}, // rotate then insert selected bits [into zero] // unary ops {name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0 @@ -547,9 +549,9 @@ func init() { // Atomic bitwise operations. // Note: 'floor' operations round the pointer down to the nearest word boundary // which reflects how they are used in the runtime. - {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem. + {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem. {name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem. - {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem. + {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem. {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem. // Compare and swap. diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index c0b663cd8f..eceef1d91a 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2285,9 +2285,9 @@ const ( OpS390XSRAWconst OpS390XRLLG OpS390XRLL - OpS390XRLLGconst OpS390XRLLconst OpS390XRXSBG + OpS390XRISBGZ OpS390XNEG OpS390XNEGW OpS390XNOT @@ -30740,10 +30740,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "RLLGconst", + name: "RLLconst", auxType: auxInt8, argLen: 1, - asm: s390x.ARLLG, + asm: s390x.ARLL, reg: regInfo{ inputs: []inputInfo{ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 @@ -30754,13 +30754,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "RLLconst", - auxType: auxInt8, - argLen: 1, - asm: s390x.ARLL, + name: "RXSBG", + auxType: auxS390XRotateParams, + argLen: 2, + resultInArg0: true, + clobberFlags: true, + asm: s390x.ARXSBG, reg: regInfo{ inputs: []inputInfo{ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + {1, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 }, outputs: []outputInfo{ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 @@ -30768,16 +30771,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "RXSBG", + name: "RISBGZ", auxType: auxS390XRotateParams, - argLen: 2, - resultInArg0: true, + argLen: 1, clobberFlags: true, - asm: s390x.ARXSBG, + asm: s390x.ARISBGZ, reg: regInfo{ inputs: []inputInfo{ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 - {1, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 }, outputs: []outputInfo{ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index 8c3c61d584..d66113d111 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -699,6 +699,8 @@ func 
rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpS390XORconst(v) case OpS390XORload: return rewriteValueS390X_OpS390XORload(v) + case OpS390XRISBGZ: + return rewriteValueS390X_OpS390XRISBGZ(v) case OpS390XRLL: return rewriteValueS390X_OpS390XRLL(v) case OpS390XRLLG: @@ -5272,9 +5274,8 @@ func rewriteValueS390X_OpS390XADD(v *Value) bool { } break } - // match: (ADD (SLDconst x [c]) (SRDconst x [d])) - // cond: d == 64-c - // result: (RLLGconst [c] x) + // match: (ADD (SLDconst x [c]) (SRDconst x [64-c])) + // result: (RISBGZ x {s390x.NewRotateParams(0, 63, c)}) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLDconst { @@ -5282,15 +5283,11 @@ func rewriteValueS390X_OpS390XADD(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRDconst { + if v_1.Op != OpS390XSRDconst || auxIntToInt8(v_1.AuxInt) != 64-c || x != v_1.Args[0] { continue } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 64-c) { - continue - } - v.reset(OpS390XRLLGconst) - v.AuxInt = int8ToAuxInt(c) + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(0, 63, c)) v.AddArg(x) return true } @@ -5470,9 +5467,8 @@ func rewriteValueS390X_OpS390XADDW(v *Value) bool { } break } - // match: (ADDW (SLWconst x [c]) (SRWconst x [d])) - // cond: d == 32-c - // result: (RLLconst [c] x) + // match: (ADDW (SLWconst x [c]) (SRWconst x [32-c])) + // result: (RLLconst x [c]) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLWconst { @@ -5480,11 +5476,7 @@ func rewriteValueS390X_OpS390XADDW(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRWconst { - continue - } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 32-c) { + if v_1.Op != OpS390XSRWconst || auxIntToInt8(v_1.AuxInt) != 32-c || x != v_1.Args[0] { continue } v.reset(OpS390XRLLconst) @@ -5844,8 +5836,8 @@ func rewriteValueS390X_OpS390XAND(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (AND x (MOVDconst [c])) - // cond: is32Bit(c) && c < 0 - // result: (ANDconst [c] x) + // cond: s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil + // result: (RISBGZ x {*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))}) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { x := v_0 @@ -5853,19 +5845,19 @@ func rewriteValueS390X_OpS390XAND(v *Value) bool { continue } c := auxIntToInt64(v_1.AuxInt) - if !(is32Bit(c) && c < 0) { + if !(s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil) { continue } - v.reset(OpS390XANDconst) - v.AuxInt = int64ToAuxInt(c) + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))) v.AddArg(x) return true } break } // match: (AND x (MOVDconst [c])) - // cond: is32Bit(c) && c >= 0 - // result: (MOVWZreg (ANDWconst [int32(c)] x)) + // cond: is32Bit(c) && c < 0 + // result: (ANDconst [c] x) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { x := v_0 @@ -5873,72 +5865,32 @@ func rewriteValueS390X_OpS390XAND(v *Value) bool { continue } c := auxIntToInt64(v_1.AuxInt) - if !(is32Bit(c) && c >= 0) { - continue - } - v.reset(OpS390XMOVWZreg) - v0 := b.NewValue0(v.Pos, OpS390XANDWconst, typ.UInt32) - v0.AuxInt = int32ToAuxInt(int32(c)) - v0.AddArg(x) - v.AddArg(v0) - return true - } - break - } - // match: (AND x (MOVDconst [0xFF])) - // result: (MOVBZreg x) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - x := v_0 - if v_1.Op != 
OpS390XMOVDconst || auxIntToInt64(v_1.AuxInt) != 0xFF { + if !(is32Bit(c) && c < 0) { continue } - v.reset(OpS390XMOVBZreg) + v.reset(OpS390XANDconst) + v.AuxInt = int64ToAuxInt(c) v.AddArg(x) return true } break } - // match: (AND x (MOVDconst [0xFFFF])) - // result: (MOVHZreg x) + // match: (AND x (MOVDconst [c])) + // cond: is32Bit(c) && c >= 0 + // result: (MOVWZreg (ANDWconst [int32(c)] x)) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { x := v_0 - if v_1.Op != OpS390XMOVDconst || auxIntToInt64(v_1.AuxInt) != 0xFFFF { + if v_1.Op != OpS390XMOVDconst { continue } - v.reset(OpS390XMOVHZreg) - v.AddArg(x) - return true - } - break - } - // match: (AND x (MOVDconst [0xFFFFFFFF])) - // result: (MOVWZreg x) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - x := v_0 - if v_1.Op != OpS390XMOVDconst || auxIntToInt64(v_1.AuxInt) != 0xFFFFFFFF { + c := auxIntToInt64(v_1.AuxInt) + if !(is32Bit(c) && c >= 0) { continue } v.reset(OpS390XMOVWZreg) - v.AddArg(x) - return true - } - break - } - // match: (AND (MOVDconst [^(-1<<63)]) (LGDR x)) - // result: (LGDR (LPDFR x)) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpS390XMOVDconst || auxIntToInt64(v_0.AuxInt) != ^(-1<<63) || v_1.Op != OpS390XLGDR { - continue - } - t := v_1.Type - x := v_1.Args[0] - v.reset(OpS390XLGDR) - v.Type = t - v0 := b.NewValue0(v.Pos, OpS390XLPDFR, x.Type) + v0 := b.NewValue0(v.Pos, OpS390XANDWconst, typ.UInt32) + v0.AuxInt = int32ToAuxInt(int32(c)) v0.AddArg(x) v.AddArg(v0) return true @@ -6103,10 +6055,10 @@ func rewriteValueS390X_OpS390XANDWconst(v *Value) bool { v.AddArg(x) return true } - // match: (ANDWconst [0xFF] x) + // match: (ANDWconst [0x00ff] x) // result: (MOVBZreg x) for { - if auxIntToInt32(v.AuxInt) != 0xFF { + if auxIntToInt32(v.AuxInt) != 0x00ff { break } x := v_0 @@ -6114,10 +6066,10 @@ func rewriteValueS390X_OpS390XANDWconst(v *Value) bool { v.AddArg(x) return true } - // match: (ANDWconst [0xFFFF] x) + // match: (ANDWconst [0xffff] x) // result: (MOVHZreg x) for { - if auxIntToInt32(v.AuxInt) != 0xFFFF { + if auxIntToInt32(v.AuxInt) != 0xffff { break } x := v_0 @@ -6515,6 +6467,21 @@ func rewriteValueS390X_OpS390XCMPUconst(v *Value) bool { v.reset(OpS390XFlagLT) return true } + // match: (CMPUconst (RISBGZ x {r}) [c]) + // cond: r.OutMask() < uint64(uint32(c)) + // result: (FlagLT) + for { + c := auxIntToInt32(v.AuxInt) + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + if !(r.OutMask() < uint64(uint32(c))) { + break + } + v.reset(OpS390XFlagLT) + return true + } // match: (CMPUconst (MOVWZreg x) [c]) // result: (CMPWUconst x [c]) for { @@ -7152,6 +7119,21 @@ func rewriteValueS390X_OpS390XCMPconst(v *Value) bool { v.reset(OpS390XFlagGT) return true } + // match: (CMPconst (RISBGZ x {r}) [c]) + // cond: c > 0 && r.OutMask() < uint64(c) + // result: (FlagLT) + for { + c := auxIntToInt32(v.AuxInt) + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + if !(c > 0 && r.OutMask() < uint64(c)) { + break + } + v.reset(OpS390XFlagLT) + return true + } // match: (CMPconst (MOVWreg x) [c]) // result: (CMPWconst x [c]) for { @@ -7684,47 +7666,25 @@ func rewriteValueS390X_OpS390XFNEGS(v *Value) bool { func rewriteValueS390X_OpS390XLDGR(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (LDGR (SRDconst [1] (SLDconst [1] x))) + // match: (LDGR (RISBGZ x {r})) + // cond: r == s390x.NewRotateParams(1, 63, 0) // result: (LPDFR (LDGR x)) for { t := v.Type - if v_0.Op != 
OpS390XSRDconst || auxIntToInt8(v_0.AuxInt) != 1 { + if v_0.Op != OpS390XRISBGZ { break } - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpS390XSLDconst || auxIntToInt8(v_0_0.AuxInt) != 1 { + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(r == s390x.NewRotateParams(1, 63, 0)) { break } - x := v_0_0.Args[0] v.reset(OpS390XLPDFR) v0 := b.NewValue0(v.Pos, OpS390XLDGR, t) v0.AddArg(x) v.AddArg(v0) return true } - // match: (LDGR (AND (MOVDconst [^(-1<<63)]) x)) - // result: (LPDFR (LDGR x)) - for { - t := v.Type - if v_0.Op != OpS390XAND { - break - } - _ = v_0.Args[1] - v_0_0 := v_0.Args[0] - v_0_1 := v_0.Args[1] - for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 { - if v_0_0.Op != OpS390XMOVDconst || auxIntToInt64(v_0_0.AuxInt) != ^(-1<<63) { - continue - } - x := v_0_1 - v.reset(OpS390XLPDFR) - v0 := b.NewValue0(v.Pos, OpS390XLDGR, t) - v0.AddArg(x) - v.AddArg(v0) - return true - } - break - } // match: (LDGR (OR (MOVDconst [-1<<63]) x)) // result: (LNDFR (LDGR x)) for { @@ -8309,6 +8269,23 @@ func rewriteValueS390X_OpS390XMOVBZreg(v *Value) bool { v.copyOf(x) return true } + // match: (MOVBZreg (RISBGZ x {r})) + // cond: r.OutMerge(0x000000ff) != nil + // result: (RISBGZ x {*r.OutMerge(0x000000ff)}) + for { + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(r.OutMerge(0x000000ff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.OutMerge(0x000000ff)) + v.AddArg(x) + return true + } // match: (MOVBZreg (ANDWconst [m] x)) // result: (MOVWZreg (ANDWconst [int32( uint8(m))] x)) for { @@ -9697,6 +9674,23 @@ func rewriteValueS390X_OpS390XMOVHZreg(v *Value) bool { v.AuxInt = int64ToAuxInt(int64(uint16(c))) return true } + // match: (MOVHZreg (RISBGZ x {r})) + // cond: r.OutMerge(0x0000ffff) != nil + // result: (RISBGZ x {*r.OutMerge(0x0000ffff)}) + for { + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(r.OutMerge(0x0000ffff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.OutMerge(0x0000ffff)) + v.AddArg(x) + return true + } // match: (MOVHZreg (ANDWconst [m] x)) // result: (MOVWZreg (ANDWconst [int32(uint16(m))] x)) for { @@ -10547,6 +10541,23 @@ func rewriteValueS390X_OpS390XMOVWZreg(v *Value) bool { v.AuxInt = int64ToAuxInt(int64(uint32(c))) return true } + // match: (MOVWZreg (RISBGZ x {r})) + // cond: r.OutMerge(0xffffffff) != nil + // result: (RISBGZ x {*r.OutMerge(0xffffffff)}) + for { + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(r.OutMerge(0xffffffff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.OutMerge(0xffffffff)) + v.AddArg(x) + return true + } return false } func rewriteValueS390X_OpS390XMOVWload(v *Value) bool { @@ -11622,9 +11633,8 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } break } - // match: ( OR (SLDconst x [c]) (SRDconst x [d])) - // cond: d == 64-c - // result: (RLLGconst [c] x) + // match: (OR (SLDconst x [c]) (SRDconst x [64-c])) + // result: (RISBGZ x {s390x.NewRotateParams(0, 63, c)}) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLDconst { @@ -11632,15 +11642,11 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRDconst { - continue - } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 64-c) { + if v_1.Op != OpS390XSRDconst || 
auxIntToInt8(v_1.AuxInt) != 64-c || x != v_1.Args[0] { continue } - v.reset(OpS390XRLLGconst) - v.AuxInt = int8ToAuxInt(c) + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(0, 63, c)) v.AddArg(x) return true } @@ -11664,22 +11670,20 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } break } - // match: (OR (SLDconst [63] (SRDconst [63] (LGDR x))) (LGDR (LPDFR y))) + // match: (OR (RISBGZ (LGDR x) {r}) (LGDR (LPDFR y))) + // cond: r == s390x.NewRotateParams(0, 0, 0) // result: (LGDR (CPSDR y x)) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpS390XSLDconst || auxIntToInt8(v_0.AuxInt) != 63 { + if v_0.Op != OpS390XRISBGZ { continue } + r := auxToS390xRotateParams(v_0.Aux) v_0_0 := v_0.Args[0] - if v_0_0.Op != OpS390XSRDconst || auxIntToInt8(v_0_0.AuxInt) != 63 { - continue - } - v_0_0_0 := v_0_0.Args[0] - if v_0_0_0.Op != OpS390XLGDR { + if v_0_0.Op != OpS390XLGDR { continue } - x := v_0_0_0.Args[0] + x := v_0_0.Args[0] if v_1.Op != OpS390XLGDR { continue } @@ -11689,6 +11693,9 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } t := v_1_0.Type y := v_1_0.Args[0] + if !(r == s390x.NewRotateParams(0, 0, 0)) { + continue + } v.reset(OpS390XLGDR) v0 := b.NewValue0(v.Pos, OpS390XCPSDR, t) v0.AddArg2(y, x) @@ -11697,28 +11704,25 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } break } - // match: (OR (SLDconst [63] (SRDconst [63] (LGDR x))) (MOVDconst [c])) - // cond: c & -1<<63 == 0 + // match: (OR (RISBGZ (LGDR x) {r}) (MOVDconst [c])) + // cond: c >= 0 && r == s390x.NewRotateParams(0, 0, 0) // result: (LGDR (CPSDR (FMOVDconst [math.Float64frombits(uint64(c))]) x)) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpS390XSLDconst || auxIntToInt8(v_0.AuxInt) != 63 { + if v_0.Op != OpS390XRISBGZ { continue } + r := auxToS390xRotateParams(v_0.Aux) v_0_0 := v_0.Args[0] - if v_0_0.Op != OpS390XSRDconst || auxIntToInt8(v_0_0.AuxInt) != 63 { + if v_0_0.Op != OpS390XLGDR { continue } - v_0_0_0 := v_0_0.Args[0] - if v_0_0_0.Op != OpS390XLGDR { - continue - } - x := v_0_0_0.Args[0] + x := v_0_0.Args[0] if v_1.Op != OpS390XMOVDconst { continue } c := auxIntToInt64(v_1.AuxInt) - if !(c&-1<<63 == 0) { + if !(c >= 0 && r == s390x.NewRotateParams(0, 0, 0)) { continue } v.reset(OpS390XLGDR) @@ -11731,73 +11735,6 @@ func rewriteValueS390X_OpS390XOR(v *Value) bool { } break } - // match: (OR (AND (MOVDconst [-1<<63]) (LGDR x)) (LGDR (LPDFR y))) - // result: (LGDR (CPSDR y x)) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpS390XAND { - continue - } - _ = v_0.Args[1] - v_0_0 := v_0.Args[0] - v_0_1 := v_0.Args[1] - for _i1 := 0; _i1 <= 1; _i1, v_0_0, v_0_1 = _i1+1, v_0_1, v_0_0 { - if v_0_0.Op != OpS390XMOVDconst || auxIntToInt64(v_0_0.AuxInt) != -1<<63 || v_0_1.Op != OpS390XLGDR { - continue - } - x := v_0_1.Args[0] - if v_1.Op != OpS390XLGDR { - continue - } - v_1_0 := v_1.Args[0] - if v_1_0.Op != OpS390XLPDFR { - continue - } - t := v_1_0.Type - y := v_1_0.Args[0] - v.reset(OpS390XLGDR) - v0 := b.NewValue0(v.Pos, OpS390XCPSDR, t) - v0.AddArg2(y, x) - v.AddArg(v0) - return true - } - } - break - } - // match: (OR (AND (MOVDconst [-1<<63]) (LGDR x)) (MOVDconst [c])) - // cond: c & -1<<63 == 0 - // result: (LGDR (CPSDR (FMOVDconst [math.Float64frombits(uint64(c))]) x)) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpS390XAND { - continue - } - _ = v_0.Args[1] - v_0_0 := v_0.Args[0] - v_0_1 := v_0.Args[1] - for _i1 := 0; _i1 <= 1; _i1, 
v_0_0, v_0_1 = _i1+1, v_0_1, v_0_0 { - if v_0_0.Op != OpS390XMOVDconst || auxIntToInt64(v_0_0.AuxInt) != -1<<63 || v_0_1.Op != OpS390XLGDR { - continue - } - x := v_0_1.Args[0] - if v_1.Op != OpS390XMOVDconst { - continue - } - c := auxIntToInt64(v_1.AuxInt) - if !(c&-1<<63 == 0) { - continue - } - v.reset(OpS390XLGDR) - v0 := b.NewValue0(v.Pos, OpS390XCPSDR, x.Type) - v1 := b.NewValue0(v.Pos, OpS390XFMOVDconst, x.Type) - v1.AuxInt = float64ToAuxInt(math.Float64frombits(uint64(c))) - v0.AddArg2(v1, x) - v.AddArg(v0) - return true - } - } - break - } // match: (OR (MOVDconst [c]) (MOVDconst [d])) // result: (MOVDconst [c|d]) for { @@ -12394,9 +12331,8 @@ func rewriteValueS390X_OpS390XORW(v *Value) bool { } break } - // match: ( ORW (SLWconst x [c]) (SRWconst x [d])) - // cond: d == 32-c - // result: (RLLconst [c] x) + // match: (ORW (SLWconst x [c]) (SRWconst x [32-c])) + // result: (RLLconst x [c]) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLWconst { @@ -12404,11 +12340,7 @@ func rewriteValueS390X_OpS390XORW(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRWconst { - continue - } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 32-c) { + if v_1.Op != OpS390XSRWconst || auxIntToInt8(v_1.AuxInt) != 32-c || x != v_1.Args[0] { continue } v.reset(OpS390XRLLconst) @@ -12980,6 +12912,221 @@ func rewriteValueS390X_OpS390XORload(v *Value) bool { } return false } +func rewriteValueS390X_OpS390XRISBGZ(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (RISBGZ (MOVWZreg x) {r}) + // cond: r.InMerge(0xffffffff) != nil + // result: (RISBGZ x {*r.InMerge(0xffffffff)}) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XMOVWZreg { + break + } + x := v_0.Args[0] + if !(r.InMerge(0xffffffff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.InMerge(0xffffffff)) + v.AddArg(x) + return true + } + // match: (RISBGZ (MOVHZreg x) {r}) + // cond: r.InMerge(0x0000ffff) != nil + // result: (RISBGZ x {*r.InMerge(0x0000ffff)}) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XMOVHZreg { + break + } + x := v_0.Args[0] + if !(r.InMerge(0x0000ffff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.InMerge(0x0000ffff)) + v.AddArg(x) + return true + } + // match: (RISBGZ (MOVBZreg x) {r}) + // cond: r.InMerge(0x000000ff) != nil + // result: (RISBGZ x {*r.InMerge(0x000000ff)}) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XMOVBZreg { + break + } + x := v_0.Args[0] + if !(r.InMerge(0x000000ff) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(*r.InMerge(0x000000ff)) + v.AddArg(x) + return true + } + // match: (RISBGZ (SLDconst x [c]) {r}) + // cond: r.InMerge(^uint64(0)<>c) != nil + // result: (RISBGZ x {(*r.InMerge(^uint64(0)>>c)).RotateLeft(-c)}) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XSRDconst { + break + } + c := auxIntToInt8(v_0.AuxInt) + x := v_0.Args[0] + if !(r.InMerge(^uint64(0)>>c) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux((*r.InMerge(^uint64(0) >> c)).RotateLeft(-c)) + v.AddArg(x) + return true + } + // match: (RISBGZ (RISBGZ x {y}) {z}) + // cond: z.InMerge(y.OutMask()) != nil + // result: (RISBGZ x {(*z.InMerge(y.OutMask())).RotateLeft(y.Amount)}) + for { + z := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XRISBGZ { + break + } + y := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + 
if !(z.InMerge(y.OutMask()) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux((*z.InMerge(y.OutMask())).RotateLeft(y.Amount)) + v.AddArg(x) + return true + } + // match: (RISBGZ x {r}) + // cond: r.End == 63 && r.Start == -r.Amount&63 + // result: (SRDconst x [-r.Amount&63]) + for { + r := auxToS390xRotateParams(v.Aux) + x := v_0 + if !(r.End == 63 && r.Start == -r.Amount&63) { + break + } + v.reset(OpS390XSRDconst) + v.AuxInt = int8ToAuxInt(-r.Amount & 63) + v.AddArg(x) + return true + } + // match: (RISBGZ x {r}) + // cond: r.Start == 0 && r.End == 63-r.Amount + // result: (SLDconst x [r.Amount]) + for { + r := auxToS390xRotateParams(v.Aux) + x := v_0 + if !(r.Start == 0 && r.End == 63-r.Amount) { + break + } + v.reset(OpS390XSLDconst) + v.AuxInt = int8ToAuxInt(r.Amount) + v.AddArg(x) + return true + } + // match: (RISBGZ (SRADconst x [c]) {r}) + // cond: r.Start == r.End && (r.Start+r.Amount)&63 <= c + // result: (RISBGZ x {s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)}) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XSRADconst { + break + } + c := auxIntToInt8(v_0.AuxInt) + x := v_0.Args[0] + if !(r.Start == r.End && (r.Start+r.Amount)&63 <= c) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)) + v.AddArg(x) + return true + } + // match: (RISBGZ x {r}) + // cond: r == s390x.NewRotateParams(56, 63, 0) + // result: (MOVBZreg x) + for { + r := auxToS390xRotateParams(v.Aux) + x := v_0 + if !(r == s390x.NewRotateParams(56, 63, 0)) { + break + } + v.reset(OpS390XMOVBZreg) + v.AddArg(x) + return true + } + // match: (RISBGZ x {r}) + // cond: r == s390x.NewRotateParams(48, 63, 0) + // result: (MOVHZreg x) + for { + r := auxToS390xRotateParams(v.Aux) + x := v_0 + if !(r == s390x.NewRotateParams(48, 63, 0)) { + break + } + v.reset(OpS390XMOVHZreg) + v.AddArg(x) + return true + } + // match: (RISBGZ x {r}) + // cond: r == s390x.NewRotateParams(32, 63, 0) + // result: (MOVWZreg x) + for { + r := auxToS390xRotateParams(v.Aux) + x := v_0 + if !(r == s390x.NewRotateParams(32, 63, 0)) { + break + } + v.reset(OpS390XMOVWZreg) + v.AddArg(x) + return true + } + // match: (RISBGZ (LGDR x) {r}) + // cond: r == s390x.NewRotateParams(1, 63, 0) + // result: (LGDR (LPDFR x)) + for { + r := auxToS390xRotateParams(v.Aux) + if v_0.Op != OpS390XLGDR { + break + } + t := v_0.Type + x := v_0.Args[0] + if !(r == s390x.NewRotateParams(1, 63, 0)) { + break + } + v.reset(OpS390XLGDR) + v.Type = t + v0 := b.NewValue0(v.Pos, OpS390XLPDFR, x.Type) + v0.AddArg(x) + v.AddArg(v0) + return true + } + return false +} func rewriteValueS390X_OpS390XRLL(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -13002,15 +13149,15 @@ func rewriteValueS390X_OpS390XRLLG(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (RLLG x (MOVDconst [c])) - // result: (RLLGconst x [int8(c&63)]) + // result: (RISBGZ x {s390x.NewRotateParams(0, 63, int8(c&63))}) for { x := v_0 if v_1.Op != OpS390XMOVDconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpS390XRLLGconst) - v.AuxInt = int8ToAuxInt(int8(c & 63)) + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(0, 63, int8(c&63))) v.AddArg(x) return true } @@ -13034,6 +13181,23 @@ func rewriteValueS390X_OpS390XSLD(v *Value) bool { v.AddArg(x) return true } + // match: (SLD x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SLD x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := 
auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSLD) + v.AddArg2(x, y) + return true + } // match: (SLD x (AND (MOVDconst [c]) y)) // result: (SLD x (ANDWconst [int32(c&63)] y)) for { @@ -13152,6 +13316,38 @@ func rewriteValueS390X_OpS390XSLD(v *Value) bool { } func rewriteValueS390X_OpS390XSLDconst(v *Value) bool { v_0 := v.Args[0] + // match: (SLDconst (SRDconst x [c]) [d]) + // result: (RISBGZ x {s390x.NewRotateParams(max8(0, c-d), 63-d, (d-c)&63)}) + for { + d := auxIntToInt8(v.AuxInt) + if v_0.Op != OpS390XSRDconst { + break + } + c := auxIntToInt8(v_0.AuxInt) + x := v_0.Args[0] + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(max8(0, c-d), 63-d, (d-c)&63)) + v.AddArg(x) + return true + } + // match: (SLDconst (RISBGZ x {r}) [c]) + // cond: s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil + // result: (RISBGZ x {(*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + for { + c := auxIntToInt8(v.AuxInt) + if v_0.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux((*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)) + v.AddArg(x) + return true + } // match: (SLDconst x [0]) // result: x for { @@ -13170,18 +13366,54 @@ func rewriteValueS390X_OpS390XSLW(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (SLW x (MOVDconst [c])) - // result: (SLWconst x [int8(c&63)]) + // cond: c&32 == 0 + // result: (SLWconst x [int8(c&31)]) for { x := v_0 if v_1.Op != OpS390XMOVDconst { break } c := auxIntToInt64(v_1.AuxInt) + if !(c&32 == 0) { + break + } v.reset(OpS390XSLWconst) - v.AuxInt = int8ToAuxInt(int8(c & 63)) + v.AuxInt = int8ToAuxInt(int8(c & 31)) v.AddArg(x) return true } + // match: (SLW _ (MOVDconst [c])) + // cond: c&32 != 0 + // result: (MOVDconst [0]) + for { + if v_1.Op != OpS390XMOVDconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + if !(c&32 != 0) { + break + } + v.reset(OpS390XMOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + // match: (SLW x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SLW x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSLW) + v.AddArg2(x, y) + return true + } // match: (SLW x (AND (MOVDconst [c]) y)) // result: (SLW x (ANDWconst [int32(c&63)] y)) for { @@ -13330,6 +13562,23 @@ func rewriteValueS390X_OpS390XSRAD(v *Value) bool { v.AddArg(x) return true } + // match: (SRAD x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SRAD x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSRAD) + v.AddArg2(x, y) + return true + } // match: (SRAD x (AND (MOVDconst [c]) y)) // result: (SRAD x (ANDWconst [int32(c&63)] y)) for { @@ -13478,18 +13727,56 @@ func rewriteValueS390X_OpS390XSRAW(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (SRAW x (MOVDconst [c])) - // result: (SRAWconst x [int8(c&63)]) + // cond: c&32 == 0 + // result: (SRAWconst x [int8(c&31)]) for { x := v_0 if v_1.Op != OpS390XMOVDconst { break } c 
:= auxIntToInt64(v_1.AuxInt) + if !(c&32 == 0) { + break + } v.reset(OpS390XSRAWconst) - v.AuxInt = int8ToAuxInt(int8(c & 63)) + v.AuxInt = int8ToAuxInt(int8(c & 31)) v.AddArg(x) return true } + // match: (SRAW x (MOVDconst [c])) + // cond: c&32 != 0 + // result: (SRAWconst x [31]) + for { + x := v_0 + if v_1.Op != OpS390XMOVDconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + if !(c&32 != 0) { + break + } + v.reset(OpS390XSRAWconst) + v.AuxInt = int8ToAuxInt(31) + v.AddArg(x) + return true + } + // match: (SRAW x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SRAW x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSRAW) + v.AddArg2(x, y) + return true + } // match: (SRAW x (AND (MOVDconst [c]) y)) // result: (SRAW x (ANDWconst [int32(c&63)] y)) for { @@ -13650,6 +13937,23 @@ func rewriteValueS390X_OpS390XSRD(v *Value) bool { v.AddArg(x) return true } + // match: (SRD x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SRD x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSRD) + v.AddArg2(x, y) + return true + } // match: (SRD x (AND (MOVDconst [c]) y)) // result: (SRD x (ANDWconst [int32(c&63)] y)) for { @@ -13768,24 +14072,36 @@ func rewriteValueS390X_OpS390XSRD(v *Value) bool { } func rewriteValueS390X_OpS390XSRDconst(v *Value) bool { v_0 := v.Args[0] - b := v.Block - // match: (SRDconst [1] (SLDconst [1] (LGDR x))) - // result: (LGDR (LPDFR x)) + // match: (SRDconst (SLDconst x [c]) [d]) + // result: (RISBGZ x {s390x.NewRotateParams(d, min8(63, 63-c+d), (c-d)&63)}) for { - if auxIntToInt8(v.AuxInt) != 1 || v_0.Op != OpS390XSLDconst || auxIntToInt8(v_0.AuxInt) != 1 { + d := auxIntToInt8(v.AuxInt) + if v_0.Op != OpS390XSLDconst { break } - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpS390XLGDR { + c := auxIntToInt8(v_0.AuxInt) + x := v_0.Args[0] + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(d, min8(63, 63-c+d), (c-d)&63)) + v.AddArg(x) + return true + } + // match: (SRDconst (RISBGZ x {r}) [c]) + // cond: s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil + // result: (RISBGZ x {(*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + for { + c := auxIntToInt8(v.AuxInt) + if v_0.Op != OpS390XRISBGZ { break } - t := v_0_0.Type - x := v_0_0.Args[0] - v.reset(OpS390XLGDR) - v.Type = t - v0 := b.NewValue0(v.Pos, OpS390XLPDFR, x.Type) - v0.AddArg(x) - v.AddArg(v0) + r := auxToS390xRotateParams(v_0.Aux) + x := v_0.Args[0] + if !(s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil) { + break + } + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux((*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)) + v.AddArg(x) return true } // match: (SRDconst x [0]) @@ -13806,18 +14122,54 @@ func rewriteValueS390X_OpS390XSRW(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (SRW x (MOVDconst [c])) - // result: (SRWconst x [int8(c&63)]) + // cond: c&32 == 0 + // result: (SRWconst x [int8(c&31)]) for { x := v_0 if v_1.Op != OpS390XMOVDconst { break } c := auxIntToInt64(v_1.AuxInt) + if !(c&32 == 0) { + break + } v.reset(OpS390XSRWconst) - v.AuxInt = int8ToAuxInt(int8(c & 63)) + v.AuxInt = int8ToAuxInt(int8(c & 31)) 
v.AddArg(x) return true } + // match: (SRW _ (MOVDconst [c])) + // cond: c&32 != 0 + // result: (MOVDconst [0]) + for { + if v_1.Op != OpS390XMOVDconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + if !(c&32 != 0) { + break + } + v.reset(OpS390XMOVDconst) + v.AuxInt = int64ToAuxInt(0) + return true + } + // match: (SRW x (RISBGZ y {r})) + // cond: r.Amount == 0 && r.OutMask()&63 == 63 + // result: (SRW x y) + for { + x := v_0 + if v_1.Op != OpS390XRISBGZ { + break + } + r := auxToS390xRotateParams(v_1.Aux) + y := v_1.Args[0] + if !(r.Amount == 0 && r.OutMask()&63 == 63) { + break + } + v.reset(OpS390XSRW) + v.AddArg2(x, y) + return true + } // match: (SRW x (AND (MOVDconst [c]) y)) // result: (SRW x (ANDWconst [int32(c&63)] y)) for { @@ -14564,9 +14916,8 @@ func rewriteValueS390X_OpS390XXOR(v *Value) bool { } break } - // match: (XOR (SLDconst x [c]) (SRDconst x [d])) - // cond: d == 64-c - // result: (RLLGconst [c] x) + // match: (XOR (SLDconst x [c]) (SRDconst x [64-c])) + // result: (RISBGZ x {s390x.NewRotateParams(0, 63, c)}) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLDconst { @@ -14574,15 +14925,11 @@ func rewriteValueS390X_OpS390XXOR(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRDconst { - continue - } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 64-c) { + if v_1.Op != OpS390XSRDconst || auxIntToInt8(v_1.AuxInt) != 64-c || x != v_1.Args[0] { continue } - v.reset(OpS390XRLLGconst) - v.AuxInt = int8ToAuxInt(c) + v.reset(OpS390XRISBGZ) + v.Aux = s390xRotateParamsToAux(s390x.NewRotateParams(0, 63, c)) v.AddArg(x) return true } @@ -14665,9 +15012,8 @@ func rewriteValueS390X_OpS390XXORW(v *Value) bool { } break } - // match: (XORW (SLWconst x [c]) (SRWconst x [d])) - // cond: d == 32-c - // result: (RLLconst [c] x) + // match: (XORW (SLWconst x [c]) (SRWconst x [32-c])) + // result: (RLLconst x [c]) for { for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { if v_0.Op != OpS390XSLWconst { @@ -14675,11 +15021,7 @@ func rewriteValueS390X_OpS390XXORW(v *Value) bool { } c := auxIntToInt8(v_0.AuxInt) x := v_0.Args[0] - if v_1.Op != OpS390XSRWconst { - continue - } - d := auxIntToInt8(v_1.AuxInt) - if x != v_1.Args[0] || !(d == 32-c) { + if v_1.Op != OpS390XSRWconst || auxIntToInt8(v_1.AuxInt) != 32-c || x != v_1.Args[0] { continue } v.reset(OpS390XRLLconst) diff --git a/src/cmd/internal/obj/s390x/rotate.go b/src/cmd/internal/obj/s390x/rotate.go index fd2d5482db..7dbc45e648 100644 --- a/src/cmd/internal/obj/s390x/rotate.go +++ b/src/cmd/internal/obj/s390x/rotate.go @@ -4,6 +4,10 @@ package s390x +import ( + "math/bits" +) + // RotateParams represents the immediates required for a "rotate // then ... selected bits instruction". // @@ -24,12 +28,18 @@ package s390x // input left by. Note that this rotation is performed // before the masked region is used. type RotateParams struct { - Start uint8 // big-endian start bit index [0..63] - End uint8 // big-endian end bit index [0..63] - Amount uint8 // amount to rotate left + Start int8 // big-endian start bit index [0..63] + End int8 // big-endian end bit index [0..63] + Amount int8 // amount to rotate left } -func NewRotateParams(start, end, amount int64) RotateParams { +// NewRotateParams creates a set of parameters representing a +// rotation left by the amount provided and a selection of the bits +// between the provided start and end indexes (inclusive). 
+// +// The start and end indexes and the rotation amount must all +// be in the range 0-63 inclusive or this function will panic. +func NewRotateParams(start, end, amount int8) RotateParams { if start&^63 != 0 { panic("start out of bounds") } @@ -40,8 +50,66 @@ func NewRotateParams(start, end, amount int64) RotateParams { panic("amount out of bounds") } return RotateParams{ - Start: uint8(start), - End: uint8(end), - Amount: uint8(amount), + Start: start, + End: end, + Amount: amount, } } + +// RotateLeft generates a new set of parameters with the rotation amount +// increased by the given value. The selected bits are left unchanged. +func (r RotateParams) RotateLeft(amount int8) RotateParams { + r.Amount += amount + r.Amount &= 63 + return r +} + +// OutMask provides a mask representing the selected bits. +func (r RotateParams) OutMask() uint64 { + // Note: z must be unsigned for bootstrap compiler + z := uint8(63-r.End+r.Start) & 63 // number of zero bits in mask + return bits.RotateLeft64(^uint64(0)<> 1, outMask: ^uint64(0) >> 1}, + {start: 0, end: 62, amount: 0, inMask: ^uint64(1), outMask: ^uint64(1)}, + {start: 1, end: 62, amount: 0, inMask: ^uint64(3) >> 1, outMask: ^uint64(3) >> 1}, + + // end before start, no rotation + {start: 63, end: 0, amount: 0, inMask: 1<<63 | 1, outMask: 1<<63 | 1}, + {start: 62, end: 0, amount: 0, inMask: 1<<63 | 3, outMask: 1<<63 | 3}, + {start: 63, end: 1, amount: 0, inMask: 3<<62 | 1, outMask: 3<<62 | 1}, + {start: 62, end: 1, amount: 0, inMask: 3<<62 | 3, outMask: 3<<62 | 3}, + + // rotation + {start: 32, end: 63, amount: 32, inMask: 0xffffffff00000000, outMask: 0x00000000ffffffff}, + {start: 48, end: 15, amount: 16, inMask: 0xffffffff00000000, outMask: 0xffff00000000ffff}, + {start: 0, end: 7, amount: -8 & 63, inMask: 0xff, outMask: 0xff << 56}, + } + for i, test := range tests { + r := NewRotateParams(test.start, test.end, test.amount) + if m := r.OutMask(); m != test.outMask { + t.Errorf("out mask %v: want %#x, got %#x", i, test.outMask, m) + } + if m := r.InMask(); m != test.inMask { + t.Errorf("in mask %v: want %#x, got %#x", i, test.inMask, m) + } + } +} + +func TestRotateParamsMerge(t *testing.T) { + tests := []struct { + // inputs + src RotateParams + mask uint64 + + // results + in *RotateParams + out *RotateParams + }{ + { + src: RotateParams{Start: 48, End: 15, Amount: 16}, + mask: 0xffffffffffffffff, + in: &RotateParams{Start: 48, End: 15, Amount: 16}, + out: &RotateParams{Start: 48, End: 15, Amount: 16}, + }, + { + src: RotateParams{Start: 16, End: 47, Amount: 0}, + mask: 0x00000000ffffffff, + in: &RotateParams{Start: 32, End: 47, Amount: 0}, + out: &RotateParams{Start: 32, End: 47, Amount: 0}, + }, + { + src: RotateParams{Start: 16, End: 47, Amount: 0}, + mask: 0xffff00000000ffff, + in: nil, + out: nil, + }, + { + src: RotateParams{Start: 0, End: 63, Amount: 0}, + mask: 0xf7f0000000000000, + in: nil, + out: nil, + }, + { + src: RotateParams{Start: 0, End: 63, Amount: 1}, + mask: 0x000000000000ff00, + in: &RotateParams{Start: 47, End: 54, Amount: 1}, + out: &RotateParams{Start: 48, End: 55, Amount: 1}, + }, + { + src: RotateParams{Start: 32, End: 63, Amount: 32}, + mask: 0xffff00000000ffff, + in: &RotateParams{Start: 32, End: 47, Amount: 32}, + out: &RotateParams{Start: 48, End: 63, Amount: 32}, + }, + { + src: RotateParams{Start: 0, End: 31, Amount: 32}, + mask: 0x8000000000000000, + in: nil, + out: &RotateParams{Start: 0, End: 0, Amount: 32}, + }, + { + src: RotateParams{Start: 0, End: 31, Amount: 32}, + mask: 0x0000000080000000, + in: 
&RotateParams{Start: 0, End: 0, Amount: 32}, + out: nil, + }, + } + + eq := func(x, y *RotateParams) bool { + if x == nil && y == nil { + return true + } + if x == nil || y == nil { + return false + } + return *x == *y + } + + for _, test := range tests { + if r := test.src.InMerge(test.mask); !eq(r, test.in) { + t.Errorf("%v merged with %#x (input): want %v, got %v", test.src, test.mask, test.in, r) + } + if r := test.src.OutMerge(test.mask); !eq(r, test.out) { + t.Errorf("%v merged with %#x (output): want %v, got %v", test.src, test.mask, test.out, r) + } + } +} diff --git a/test/codegen/bitfield.go b/test/codegen/bitfield.go index 08788f1447..7abc1c2783 100644 --- a/test/codegen/bitfield.go +++ b/test/codegen/bitfield.go @@ -127,11 +127,13 @@ func sbfx6(x int32) int32 { // ubfiz func ubfiz1(x uint64) uint64 { // arm64:"UBFIZ\t[$]3, R[0-9]+, [$]12",-"LSL",-"AND" + // s390x:"RISBGZ\t[$]49, [$]60, [$]3,",-"SLD",-"AND" return (x & 0xfff) << 3 } func ubfiz2(x uint64) uint64 { // arm64:"UBFIZ\t[$]4, R[0-9]+, [$]12",-"LSL",-"AND" + // s390x:"RISBGZ\t[$]48, [$]59, [$]4,",-"SLD",-"AND" return (x << 4) & 0xfff0 } @@ -149,6 +151,7 @@ func ubfiz5(x uint8) uint64 { func ubfiz6(x uint64) uint64 { // arm64:"UBFIZ\t[$]1, R[0-9]+, [$]60",-"LSL",-"LSR" + // s390x:"RISBGZ\t[$]3, [$]62, [$]1, ",-"SLD",-"SRD" return (x << 4) >> 3 } @@ -159,6 +162,7 @@ func ubfiz7(x uint32) uint32 { func ubfiz8(x uint64) uint64 { // arm64:"UBFIZ\t[$]1, R[0-9]+, [$]20",-"LSL",-"LSR" + // s390x:"RISBGZ\t[$]43, [$]62, [$]1, ",-"SLD",-"SRD",-"AND" return ((x & 0xfffff) << 4) >> 3 } @@ -169,17 +173,20 @@ func ubfiz9(x uint64) uint64 { func ubfiz10(x uint64) uint64 { // arm64:"UBFIZ\t[$]7, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND" + // s390x:"RISBGZ\t[$]45, [$]56, [$]7, ",-"SLD",-"SRD",-"AND" return ((x << 5) & (0xfff << 5)) << 2 } // ubfx func ubfx1(x uint64) uint64 { // arm64:"UBFX\t[$]25, R[0-9]+, [$]10",-"LSR",-"AND" + // s390x:"RISBGZ\t[$]54, [$]63, [$]39, ",-"SRD",-"AND" return (x >> 25) & 1023 } func ubfx2(x uint64) uint64 { // arm64:"UBFX\t[$]4, R[0-9]+, [$]8",-"LSR",-"AND" + // s390x:"RISBGZ\t[$]56, [$]63, [$]60, ",-"SRD",-"AND" return (x & 0x0ff0) >> 4 } @@ -196,30 +203,37 @@ func ubfx5(x uint8) uint64 { } func ubfx6(x uint64) uint64 { - return (x << 1) >> 2 // arm64:"UBFX\t[$]1, R[0-9]+, [$]62",-"LSL",-"LSR" + // arm64:"UBFX\t[$]1, R[0-9]+, [$]62",-"LSL",-"LSR" + // s390x:"RISBGZ\t[$]2, [$]63, [$]63,",-"SLD",-"SRD" + return (x << 1) >> 2 } func ubfx7(x uint32) uint32 { - return (x << 1) >> 2 // arm64:"UBFX\t[$]1, R[0-9]+, [$]30",-"LSL",-"LSR" + // arm64:"UBFX\t[$]1, R[0-9]+, [$]30",-"LSL",-"LSR" + return (x << 1) >> 2 } func ubfx8(x uint64) uint64 { // arm64:"UBFX\t[$]1, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND" + // s390x:"RISBGZ\t[$]52, [$]63, [$]63,",-"SLD",-"SRD",-"AND" return ((x << 1) >> 2) & 0xfff } func ubfx9(x uint64) uint64 { // arm64:"UBFX\t[$]4, R[0-9]+, [$]11",-"LSL",-"LSR",-"AND" + // s390x:"RISBGZ\t[$]53, [$]63, [$]60, ",-"SLD",-"SRD",-"AND" return ((x >> 3) & 0xfff) >> 1 } func ubfx10(x uint64) uint64 { // arm64:"UBFX\t[$]5, R[0-9]+, [$]56",-"LSL",-"LSR" + // s390x:"RISBGZ\t[$]8, [$]63, [$]59, ",-"SLD",-"SRD" return ((x >> 2) << 5) >> 8 } func ubfx11(x uint64) uint64 { // arm64:"UBFX\t[$]1, R[0-9]+, [$]19",-"LSL",-"LSR" + // s390x:"RISBGZ\t[$]45, [$]63, [$]63, ",-"SLD",-"SRD",-"AND" return ((x & 0xfffff) << 3) >> 4 } diff --git a/test/codegen/bits.go b/test/codegen/bits.go index 398dd84e9e..56e0f3474e 100644 --- a/test/codegen/bits.go +++ b/test/codegen/bits.go @@ -340,3 +340,15 @@ func bitSetTest(x int) bool { // 
amd64:"CMPQ\tAX, [$]9" return x&9 == 9 } + +// mask contiguous one bits +func cont1Mask64U(x uint64) uint64 { + // s390x:"RISBGZ\t[$]16, [$]47, [$]0," + return x&0x0000ffffffff0000 +} + +// mask contiguous zero bits +func cont0Mask64U(x uint64) uint64 { + // s390x:"RISBGZ\t[$]48, [$]15, [$]0," + return x&0xffff00000000ffff +} diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 4c35f26997..fff6639546 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -213,7 +213,7 @@ func RotateLeft64(n uint64) uint64 { // arm64:"ROR" // ppc64:"ROTL" // ppc64le:"ROTL" - // s390x:"RLLG" + // s390x:"RISBGZ\t[$]0, [$]63, [$]37, " // wasm:"I64Rotl" return bits.RotateLeft64(n, 37) } diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go index 0c8b030970..e0bcd0abbc 100644 --- a/test/codegen/rotate.go +++ b/test/codegen/rotate.go @@ -17,21 +17,21 @@ func rot64(x uint64) uint64 { // amd64:"ROLQ\t[$]7" // arm64:"ROR\t[$]57" - // s390x:"RLLG\t[$]7" + // s390x:"RISBGZ\t[$]0, [$]63, [$]7, " // ppc64:"ROTL\t[$]7" // ppc64le:"ROTL\t[$]7" a += x<<7 | x>>57 // amd64:"ROLQ\t[$]8" // arm64:"ROR\t[$]56" - // s390x:"RLLG\t[$]8" + // s390x:"RISBGZ\t[$]0, [$]63, [$]8, " // ppc64:"ROTL\t[$]8" // ppc64le:"ROTL\t[$]8" a += x<<8 + x>>56 // amd64:"ROLQ\t[$]9" // arm64:"ROR\t[$]55" - // s390x:"RLLG\t[$]9" + // s390x:"RISBGZ\t[$]0, [$]63, [$]9, " // ppc64:"ROTL\t[$]9" // ppc64le:"ROTL\t[$]9" a += x<<9 ^ x>>55 diff --git a/test/codegen/shift.go b/test/codegen/shift.go index a45f27c9cf..d19a1984c1 100644 --- a/test/codegen/shift.go +++ b/test/codegen/shift.go @@ -11,84 +11,84 @@ package codegen // ------------------ // func lshMask64x64(v int64, s uint64) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v << (s & 63) } func rshMask64Ux64(v uint64, s uint64) uint64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v >> (s & 63) } func rshMask64x64(v int64, s uint64) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v >> (s & 63) } func lshMask32x64(v int32, s uint64) int32 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ISEL",-"ORN" // ppc64:"ISEL",-"ORN" return v << (s & 63) } func rshMask32Ux64(v uint32, s uint64) uint32 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ISEL",-"ORN" // ppc64:"ISEL",-"ORN" return v >> (s & 63) } func rshMask32x64(v int32, s uint64) int32 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ISEL",-"ORN" // ppc64:"ISEL",-"ORN" return v >> (s & 63) } func lshMask64x32(v int64, s uint32) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN" // ppc64:"ANDCC",-"ORN" return v << (s & 63) } func rshMask64Ux32(v uint64, s uint32) uint64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN" // ppc64:"ANDCC",-"ORN" return v >> (s & 63) } func rshMask64x32(v int64, s uint32) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v >> (s & 63) } func lshMask64x32Ext(v int64, s int32) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // 
ppc64:"ANDCC",-"ORN",-"ISEL" return v << uint(s&63) } func rshMask64Ux32Ext(v uint64, s int32) uint64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v >> uint(s&63) } func rshMask64x32Ext(v int64, s int32) int64 { - // s390x:-".*AND",-".*MOVDGE" + // s390x:-"RISBGZ",-"AND",-"LOCGR" // ppc64le:"ANDCC",-"ORN",-"ISEL" // ppc64:"ANDCC",-"ORN",-"ISEL" return v >> uint(s&63) @@ -128,7 +128,8 @@ func lshSignedMasked(v8 int8, v16 int16, v32 int32, v64 int64, x int) { func rshGuarded64(v int64, s uint) int64 { if s < 64 { - // s390x:-".*AND",-".*MOVDGE" wasm:-"Select",-".*LtU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" return v >> s } panic("shift too large") @@ -136,7 +137,8 @@ func rshGuarded64(v int64, s uint) int64 { func rshGuarded64U(v uint64, s uint) uint64 { if s < 64 { - // s390x:-".*AND",-".*MOVDGE" wasm:-"Select",-".*LtU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" return v >> s } panic("shift too large") @@ -144,7 +146,8 @@ func rshGuarded64U(v uint64, s uint) uint64 { func lshGuarded64(v int64, s uint) int64 { if s < 64 { - // s390x:-".*AND",-".*MOVDGE" wasm:-"Select",-".*LtU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" return v << s } panic("shift too large") -- cgit v1.3 From 7307e86afda3c5c7f6158d2469c39606fd1dba65 Mon Sep 17 00:00:00 2001 From: Alberto Donizetti Date: Sun, 8 Nov 2020 09:44:33 +0100 Subject: test/codegen: go fmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #42445 Change-Id: I9653ef094dba2a1ac2e3daaa98279d10df17a2a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/268257 Trust: Alberto Donizetti Trust: Martin Möhrmann Run-TryBot: Alberto Donizetti TryBot-Result: Go Bot Reviewed-by: Martin Möhrmann --- test/codegen/bits.go | 4 +-- test/codegen/compare_and_branch.go | 72 +++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'test/codegen') diff --git a/test/codegen/bits.go b/test/codegen/bits.go index 56e0f3474e..4508eba487 100644 --- a/test/codegen/bits.go +++ b/test/codegen/bits.go @@ -344,11 +344,11 @@ func bitSetTest(x int) bool { // mask contiguous one bits func cont1Mask64U(x uint64) uint64 { // s390x:"RISBGZ\t[$]16, [$]47, [$]0," - return x&0x0000ffffffff0000 + return x & 0x0000ffffffff0000 } // mask contiguous zero bits func cont0Mask64U(x uint64) uint64 { // s390x:"RISBGZ\t[$]48, [$]15, [$]0," - return x&0xffff00000000ffff + return x & 0xffff00000000ffff } diff --git a/test/codegen/compare_and_branch.go b/test/codegen/compare_and_branch.go index 696a2d5f1c..f7515064b0 100644 --- a/test/codegen/compare_and_branch.go +++ b/test/codegen/compare_and_branch.go @@ -155,52 +155,52 @@ func ui32x8() { // Signed 64-bit comparison with unsigned 8-bit immediate. func si64xu8(x chan int64) { - // s390x:"CLGIJ\t[$]8, R[0-9]+, [$]128, " - for <-x == 128 { - dummy() - } - - // s390x:"CLGIJ\t[$]6, R[0-9]+, [$]255, " - for <-x != 255 { - dummy() - } + // s390x:"CLGIJ\t[$]8, R[0-9]+, [$]128, " + for <-x == 128 { + dummy() + } + + // s390x:"CLGIJ\t[$]6, R[0-9]+, [$]255, " + for <-x != 255 { + dummy() + } } // Signed 32-bit comparison with unsigned 8-bit immediate. 
func si32xu8(x chan int32) { - // s390x:"CLIJ\t[$]8, R[0-9]+, [$]255, " - for <-x == 255 { - dummy() - } - - // s390x:"CLIJ\t[$]6, R[0-9]+, [$]128, " - for <-x != 128 { - dummy() - } + // s390x:"CLIJ\t[$]8, R[0-9]+, [$]255, " + for <-x == 255 { + dummy() + } + + // s390x:"CLIJ\t[$]6, R[0-9]+, [$]128, " + for <-x != 128 { + dummy() + } } // Unsigned 64-bit comparison with signed 8-bit immediate. func ui64xu8(x chan uint64) { - // s390x:"CGIJ\t[$]8, R[0-9]+, [$]-1, " - for <-x == ^uint64(0) { - dummy() - } - - // s390x:"CGIJ\t[$]6, R[0-9]+, [$]-128, " - for <-x != ^uint64(127) { - dummy() - } + // s390x:"CGIJ\t[$]8, R[0-9]+, [$]-1, " + for <-x == ^uint64(0) { + dummy() + } + + // s390x:"CGIJ\t[$]6, R[0-9]+, [$]-128, " + for <-x != ^uint64(127) { + dummy() + } } // Unsigned 32-bit comparison with signed 8-bit immediate. func ui32xu8(x chan uint32) { - // s390x:"CIJ\t[$]8, R[0-9]+, [$]-128, " - for <-x == ^uint32(127) { - dummy() - } - - // s390x:"CIJ\t[$]6, R[0-9]+, [$]-1, " - for <-x != ^uint32(0) { - dummy() - } + // s390x:"CIJ\t[$]8, R[0-9]+, [$]-128, " + for <-x == ^uint32(127) { + dummy() + } + + // s390x:"CIJ\t[$]6, R[0-9]+, [$]-1, " + for <-x != ^uint32(0) { + dummy() + } } -- cgit v1.3 From 0ae3b7cb742c586df9b68d9eac042b32148abf9c Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Mon, 16 Nov 2020 09:40:45 -0500 Subject: cmd/compile: fix rules regression with shifts on PPC64 Some rules for PPC64 were checking for a case where a shift followed by an 'and' of a mask could be lowered, depending on the format of the mask. The function to verify if the mask was valid for this purpose was not checking if the mask was 0 which we don't want to allow. This case can happen if previous optimizations resulted in that mask value. This fixes isPPC64ValidShiftMask to check for a mask of 0 and return false. This also adds a codegen testcase to verify it doesn't try to match the rules in the future. Fixes #42610 Change-Id: I565d94e88495f51321ab365d6388c01e791b4dbb Reviewed-on: https://go-review.googlesource.com/c/go/+/270358 Run-TryBot: Lynn Boger TryBot-Result: Go Bot Reviewed-by: Paul Murphy Reviewed-by: Carlos Eduardo Seo Trust: Lynn Boger --- src/cmd/compile/internal/ssa/rewrite.go | 7 ++++--- test/codegen/issue42610.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 test/codegen/issue42610.go (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 39aa63d947..24efd38fb7 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -1427,10 +1427,11 @@ func DecodePPC64RotateMask(sauxint int64) (rotate, mb, me int64, mask uint64) { return } -// This verifies that the mask occupies the -// rightmost bits. +// This verifies that the mask is a set of +// consecutive bits including the least +// significant bit. func isPPC64ValidShiftMask(v int64) bool { - if ((v + 1) & v) == 0 { + if (v != 0) && ((v+1)&v) == 0 { return true } return false diff --git a/test/codegen/issue42610.go b/test/codegen/issue42610.go new file mode 100644 index 0000000000..c7eeddc53c --- /dev/null +++ b/test/codegen/issue42610.go @@ -0,0 +1,30 @@ +// asmcheck + +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Don't allow 0 masks in shift lowering rules on ppc64x. +// See issue 42610. 
+ +package codegen + +func f32(a []int32, i uint32) { + g := func(p int32) int32 { + i = uint32(p) * (uint32(p) & (i & 1)) + return 1 + } + // ppc64le: -"RLWNIM" + // ppc64: -"RLWNIM" + a[0] = g(8) >> 1 +} + +func f(a []int, i uint) { + g := func(p int) int { + i = uint(p) * (uint(p) & (i & 1)) + return 1 + } + // ppc64le: -"RLDIC" + // ppc64: -"RLDIC" + a[0] = g(8) >> 1 +} -- cgit v1.3