diff options
| author | Alexander Musman <alexander.musman@gmail.com> | 2026-04-01 16:23:15 +0300 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2026-04-08 03:51:48 -0700 |
| commit | 9111d85e2f699672d67dcee1d6432a940f5306e1 (patch) | |
| tree | 1b9538168dbf9d5f28d4f50a2f4905bd5c0c847e | |
| parent | 4dffc57944c829d2fb2cf1b25168c27e555a8e5c (diff) | |
| download | go-9111d85e2f699672d67dcee1d6432a940f5306e1.tar.xz | |
cmd/internal/obj/arm64: add ASIMD shift instructions
Add support for ASIMD shift-by-immediate instructions. These use the
ASIMDSHF encoding class from the ARM architecture specification, where the
immediate field combines the element size and the shift amount
(esize*2 - shift for right shifts, esize + shift for left shifts).
Also add ASIMD shifts-by-vector (3-register form) where the shift
amount comes from a second vector register. These use the ASIMDSAME
encoding class.
New instructions by group:
Right shift by immediate (signed): VSSHR, VSRSHR
Left shift by immediate (saturating): VSQSHL, VUQSHL
Narrowing right shift by immediate: VSHRN, VSHRN2
Shift by vector (3-reg): VSSHL, VUSHL, VSQSHL, VUQSHL
Change-Id: I039cc16bc01980b04e6940cc1d4670faf5fa7e3c
Reviewed-on: https://go-review.googlesource.com/c/go/+/762180
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
| -rw-r--r-- | src/cmd/asm/internal/asm/testdata/arm64.s | 62 | ||||
| -rw-r--r-- | src/cmd/asm/internal/asm/testdata/arm64error.s | 16 | ||||
| -rw-r--r-- | src/cmd/internal/obj/arm64/a.out.go | 8 | ||||
| -rw-r--r-- | src/cmd/internal/obj/arm64/anames.go | 8 | ||||
| -rw-r--r-- | src/cmd/internal/obj/arm64/asm7.go | 64 |
5 files changed, 153 insertions, 5 deletions
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s index 2f85308bd3..bb0c9e2c05 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64.s +++ b/src/cmd/asm/internal/asm/testdata/arm64.s @@ -154,6 +154,26 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 VUSHR $8, V1.H8, V2.H8 // 2204186f VUSHR $2, V1.B8, V2.B8 // 22040e2f VUSHR $2, V1.B16, V2.B16 // 22040e6f + VSHRN $7, V1.H8, V0.B8 // 2084090f + VSHRN $15, V1.S4, V0.H4 // 2084110f + VSHRN $31, V1.D2, V0.S2 // 2084210f + VSHRN2 $7, V1.H8, V0.B16 // 2084094f + VSHRN2 $15, V1.S4, V0.H8 // 2084114f + VSHRN2 $31, V1.D2, V0.S4 // 2084214f + VSSHR $2, V0.B8, V1.B8 // 01040e0f + VSSHR $2, V0.B16, V1.B16 // 01040e4f + VSSHR $8, V0.H4, V1.H4 // 0104180f + VSSHR $8, V0.H8, V1.H8 // 0104184f + VSSHR $16, V0.S2, V1.S2 // 0104300f + VSSHR $16, V0.S4, V1.S4 // 0104304f + VSSHR $32, V0.D2, V1.D2 // 0104604f + VSRSHR $2, V0.B8, V1.B8 // 01240e0f + VSRSHR $2, V0.B16, V1.B16 // 01240e4f + VSRSHR $8, V0.H4, V1.H4 // 0124180f + VSRSHR $8, V0.H8, V1.H8 // 0124184f + VSRSHR $16, V0.S2, V1.S2 // 0124300f + VSRSHR $16, V0.S4, V1.S4 // 0124304f + VSRSHR $32, V0.D2, V1.D2 // 0124604f VSHL $56, V1.D2, V2.D2 // 2254784f VSHL $24, V1.S4, V2.S4 // 2254384f VSHL $24, V1.S2, V2.S2 // 2254380f @@ -161,6 +181,48 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 VSHL $8, V1.H8, V2.H8 // 2254184f VSHL $2, V1.B8, V2.B8 // 22540a0f VSHL $2, V1.B16, V2.B16 // 22540a4f + VSQSHL $56, V1.D2, V2.D2 // 2274784f + VSQSHL $24, V1.S4, V2.S4 // 2274384f + VSQSHL $24, V1.S2, V2.S2 // 2274380f + VSQSHL $8, V1.H4, V2.H4 // 2274180f + VSQSHL $8, V1.H8, V2.H8 // 2274184f + VSQSHL $2, V1.B8, V2.B8 // 22740a0f + VSQSHL $2, V1.B16, V2.B16 // 22740a4f + VUQSHL $56, V1.D2, V2.D2 // 2274786f + VUQSHL $24, V1.S4, V2.S4 // 2274386f + VUQSHL $24, V1.S2, V2.S2 // 2274382f + VUQSHL $8, V1.H4, V2.H4 // 2274182f + VUQSHL $8, V1.H8, V2.H8 // 2274186f + VUQSHL $2, V1.B8, V2.B8 // 22740a2f + VUQSHL $2, V1.B16, V2.B16 // 22740a6f + VSSHL V1.D2, V2.D2, V3.D2 // 
4344e14e + VSSHL V1.S4, V2.S4, V3.S4 // 4344a14e + VSSHL V1.S2, V2.S2, V3.S2 // 4344a10e + VSSHL V1.H4, V2.H4, V3.H4 // 4344610e + VSSHL V1.H8, V2.H8, V3.H8 // 4344614e + VSSHL V1.B8, V2.B8, V3.B8 // 4344210e + VSSHL V1.B16, V2.B16, V3.B16 // 4344214e + VUSHL V1.D2, V2.D2, V3.D2 // 4344e16e + VUSHL V1.S4, V2.S4, V3.S4 // 4344a16e + VUSHL V1.S2, V2.S2, V3.S2 // 4344a12e + VUSHL V1.H4, V2.H4, V3.H4 // 4344612e + VUSHL V1.H8, V2.H8, V3.H8 // 4344616e + VUSHL V1.B8, V2.B8, V3.B8 // 4344212e + VUSHL V1.B16, V2.B16, V3.B16 // 4344216e + VSQSHL V1.D2, V2.D2, V3.D2 // 434ce14e + VSQSHL V1.S4, V2.S4, V3.S4 // 434ca14e + VSQSHL V1.S2, V2.S2, V3.S2 // 434ca10e + VSQSHL V1.H4, V2.H4, V3.H4 // 434c610e + VSQSHL V1.H8, V2.H8, V3.H8 // 434c614e + VSQSHL V1.B8, V2.B8, V3.B8 // 434c210e + VSQSHL V1.B16, V2.B16, V3.B16 // 434c214e + VUQSHL V1.D2, V2.D2, V3.D2 // 434ce16e + VUQSHL V1.S4, V2.S4, V3.S4 // 434ca16e + VUQSHL V1.S2, V2.S2, V3.S2 // 434ca12e + VUQSHL V1.H4, V2.H4, V3.H4 // 434c612e + VUQSHL V1.H8, V2.H8, V3.H8 // 434c616e + VUQSHL V1.B8, V2.B8, V3.B8 // 434c212e + VUQSHL V1.B16, V2.B16, V3.B16 // 434c216e VSRI $56, V1.D2, V2.D2 // 2244486f VSRI $24, V1.S4, V2.S4 // 2244286f VSRI $24, V1.S2, V2.S2 // 2244282f diff --git a/src/cmd/asm/internal/asm/testdata/arm64error.s b/src/cmd/asm/internal/asm/testdata/arm64error.s index 71c025ca55..72a22896c1 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64error.s +++ b/src/cmd/asm/internal/asm/testdata/arm64error.s @@ -147,6 +147,10 @@ TEXT errors(SB),$0 VRBIT V1.H4, V2.H4 // ERROR "invalid arrangement" VUSHR $56, V1.D2, V2.H4 // ERROR "invalid arrangement" VUSHR $127, V1.D2, V2.D2 // ERROR "shift out of range" + VSSHR $127, V1.D2, V2.D2 // ERROR "shift out of range" + VSSHR $56, V1.D2, V2.H4 // ERROR "invalid arrangement" + VSRSHR $127, V1.D2, V2.D2 // ERROR "shift out of range" + VSRSHR $56, V1.D2, V2.H4 // ERROR "invalid arrangement" VLD1.P (R8)(R9.SXTX<<2), [V2.B16] // ERROR "invalid extended register" VLD1.P (R8)(R9<<2), 
[V2.B16] // ERROR "invalid extended register" VST1.P [V1.B16], (R8)(R9.UXTW) // ERROR "invalid extended register" @@ -379,6 +383,10 @@ TEXT errors(SB),$0 VUMIN V1.H4, V2.S4, V3.H4 // ERROR "operand mismatch" VSLI $64, V7.D2, V8.D2 // ERROR "shift out of range" VUSRA $0, V7.D2, V8.D2 // ERROR "shift out of range" + VSSHL V1.B8, V2.B8, V3.B16 // ERROR "operand mismatch" + VUSHL V1.B8, V2.B8, V3.B16 // ERROR "operand mismatch" + VSQSHL V1.B8, V2.B8, V3.B16 // ERROR "operand mismatch" + VUQSHL V1.B8, V2.B8, V3.B16 // ERROR "operand mismatch" CASPD (R3, R4), (R2), (R8, R9) // ERROR "source register pair must start from even register" CASPD (R2, R3), (R2), (R9, R10) // ERROR "destination register pair must start from even register" CASPD (R2, R4), (R2), (R8, R9) // ERROR "source register pair must be contiguous" @@ -429,4 +437,12 @@ TEXT errors(SB),$0 AUTIA1716 $45 // ERROR "illegal combination" AUTIB1716 R0 // ERROR "illegal combination" SB $1 // ERROR "illegal combination" + + // VSHRN/VSHRN2 error test cases - invalid arrangements + VSHRN $8, V1.B8, V0.B8 // ERROR "invalid arrangement" + VSHRN $8, V1.S4, V0.S4 // ERROR "invalid arrangement" + VSHRN $8, V1.H8, V0.H8 // ERROR "invalid arrangement" + VSHRN2 $8, V1.B8, V0.B16 // ERROR "invalid arrangement" + VSHRN2 $8, V1.S4, V0.S4 // ERROR "invalid arrangement" + VSHRN2 $8, V1.H8, V0.H8 // ERROR "invalid arrangement" RET diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go index fdc42eabaa..56f68756fd 100644 --- a/src/cmd/internal/obj/arm64/a.out.go +++ b/src/cmd/internal/obj/arm64/a.out.go @@ -1161,7 +1161,13 @@ const ( AVREV32 AVREV64 AVSHL + AVSHRN + AVSHRN2 AVSLI + AVSQSHL + AVSSHL + AVUSHL + AVUQSHL AVSRI AVST1 AVST2 @@ -1180,6 +1186,8 @@ const ( AVUSHLL AVUSHLL2 AVUSHR + AVSRSHR + AVSSHR AVUSRA AVUXTL AVUXTL2 diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go index 04986e1748..e40c043edd 100644 --- a/src/cmd/internal/obj/arm64/anames.go +++ 
b/src/cmd/internal/obj/arm64/anames.go @@ -517,7 +517,13 @@ var Anames = []string{ "VREV32", "VREV64", "VSHL", + "VSHRN", + "VSHRN2", "VSLI", + "VSQSHL", + "VSSHL", + "VUSHL", + "VUQSHL", "VSRI", "VST1", "VST2", @@ -536,6 +542,8 @@ var Anames = []string{ "VUSHLL", "VUSHLL2", "VUSHR", + "VSRSHR", + "VSSHR", "VUSRA", "VUXTL", "VUXTL2", diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index ecc62251a0..0d8c1f417e 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -542,7 +542,9 @@ var optab = []Optab{ {AVEXT, C_VCON, C_ARNG, C_ARNG, C_ARNG, C_NONE, 94, 4, 0, 0, 0}, {AVTBL, C_ARNG, C_NONE, C_LIST, C_ARNG, C_NONE, 100, 4, 0, 0, 0}, {AVUSHR, C_VCON, C_ARNG, C_NONE, C_ARNG, C_NONE, 95, 4, 0, 0, 0}, + {AVSQSHL, C_VCON, C_ARNG, C_NONE, C_ARNG, C_NONE, 95, 4, 0, 0, 0}, {AVZIP1, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 72, 4, 0, 0, 0}, + {AVSQSHL, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 72, 4, 0, 0, 0}, {AVUSHLL, C_VCON, C_ARNG, C_NONE, C_ARNG, C_NONE, 102, 4, 0, 0, 0}, {AVUXTL, C_ARNG, C_NONE, C_NONE, C_ARNG, C_NONE, 102, 4, 0, 0, 0}, {AVUADDW, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 105, 4, 0, 0, 0}, @@ -3217,6 +3219,8 @@ func buildop(ctxt *obj.Link) { oprangeset(AVBIT, t) oprangeset(AVCMTST, t) oprangeset(AVCMHI, t) + oprangeset(AVSSHL, t) + oprangeset(AVUSHL, t) oprangeset(AVCMHS, t) oprangeset(AVUMAX, t) oprangeset(AVUMIN, t) @@ -3277,6 +3281,13 @@ func buildop(ctxt *obj.Link) { oprangeset(AVSRI, t) oprangeset(AVSLI, t) oprangeset(AVUSRA, t) + oprangeset(AVSSHR, t) + oprangeset(AVSRSHR, t) + oprangeset(AVSHRN, t) + oprangeset(AVSHRN2, t) + + case AVSQSHL: + oprangeset(AVUQSHL, t) case AVREV32: oprangeset(AVCNT, t) @@ -5431,14 +5442,15 @@ func (c *ctxt7) asmout(p *obj.Prog, out []uint32) (count int) { af := int((p.Reg >> 5) & 15) shift := int(p.From.Offset) - if af != at { + if af != at && p.As != AVSHRN && p.As != AVSHRN2 { c.ctxt.Diag("invalid arrangement on op Vn.<T>, Vd.<T>: %v", p) + at = af } 
var Q uint32 var imax, esize int - switch af { + switch at { case ARNG_8B, ARNG_4H, ARNG_2S: Q = 0 case ARNG_16B, ARNG_8H, ARNG_4S, ARNG_2D: @@ -5447,29 +5459,44 @@ func (c *ctxt7) asmout(p *obj.Prog, out []uint32) (count int) { c.ctxt.Diag("invalid arrangement on op Vn.<T>, Vd.<T>: %v", p) } - switch af { + atwice := -1 + switch at { case ARNG_8B, ARNG_16B: imax = 15 esize = 8 + atwice = ARNG_8H case ARNG_4H, ARNG_8H: imax = 31 esize = 16 + atwice = ARNG_4S case ARNG_2S, ARNG_4S: imax = 63 esize = 32 + atwice = ARNG_2D case ARNG_2D: imax = 127 esize = 64 } + switch p.As { + case AVSHRN: + if Q != 0 || atwice != af { + c.ctxt.Diag("invalid arrangement on op: %v", p) + } + case AVSHRN2: + if Q != 1 || atwice != af { + c.ctxt.Diag("invalid arrangement on op: %v", p) + } + } + imm := 0 switch p.As { - case AVUSHR, AVSRI, AVUSRA: + case AVUSHR, AVSRI, AVUSRA, AVSSHR, AVSRSHR, AVSHRN, AVSHRN2: imm = esize*2 - shift if imm < esize || imm > imax { c.ctxt.Diag("shift out of range: %v", p) } - case AVSHL, AVSLI: + case AVSHL, AVSLI, AVSQSHL, AVUQSHL: imm = esize + shift if imm > imax { c.ctxt.Diag("shift out of range: %v", p) @@ -6538,9 +6565,21 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As, rd, rn, rm int16) uint32 { case AVSUB: op = ASIMDSAME(1, 0, 0x10) + case AVSSHL: + op = ASIMDSAME(0, 0, 0x8) + + case AVUSHL: + op = ASIMDSAME(1, 0, 0x8) + case AVADDP: op = ASIMDSAME(0, 0, 0x17) + case AVSQSHL: + op = ASIMDSAME(0, 0, 0x9) + + case AVUQSHL: + op = ASIMDSAME(1, 0, 0x9) + case AVAND: op = ASIMDSAME(0, 0, 0x03) @@ -6895,9 +6934,24 @@ func (c *ctxt7) opirr(p *obj.Prog, a obj.As) uint32 { case AVUSHR: return ASIMDSHF(1, 0x00) + case AVSSHR: + return ASIMDSHF(0, 0x00) + + case AVSRSHR: + return ASIMDSHF(0, 0x04) + case AVSHL: return ASIMDSHF(0, 0x0A) + case AVSQSHL: + return ASIMDSHF(0, 0xE) + + case AVUQSHL: + return ASIMDSHF(1, 0xE) + + case AVSHRN, AVSHRN2: + return ASIMDSHF(0, 0x10) + case AVSRI: return ASIMDSHF(1, 0x08) |
