aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorfanzha02 <fannie.zhang@arm.com>2020-09-28 18:58:51 +0800
committerfannie zhang <Fannie.Zhang@arm.com>2020-10-29 03:52:23 +0000
commit3089ef6bd7ecc7af4b23eb68e3d7879d340aa673 (patch)
tree9a7a30af48ab7f3cc2205de7a3c6b935b8954ea3 /src
parent15f01d6ae9853fd51ee8842d9af93d04ce25458c (diff)
downloadgo-3089ef6bd7ecc7af4b23eb68e3d7879d340aa673.tar.xz
cmd/asm: add several arm64 SIMD instructions
This patch enables VSLI, VUADDW(2), VUSRA and FMOVQ SIMD instructions required by the issue #40725. And the GNU syntax of 'FMOVQ' is 128-bit ldr/str(immediate, simd&fp). Add test cases. Fixes #40725 Change-Id: Ide968ef4a9385ce4cd8f69bce854289014d30456 Reviewed-on: https://go-review.googlesource.com/c/go/+/258397 Trust: fannie zhang <Fannie.Zhang@arm.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src')
-rw-r--r--src/cmd/asm/internal/asm/testdata/arm64.s11
-rw-r--r--src/cmd/asm/internal/asm/testdata/arm64error.s16
-rw-r--r--src/cmd/internal/obj/arm64/a.out.go59
-rw-r--r--src/cmd/internal/obj/arm64/anames.go5
-rw-r--r--src/cmd/internal/obj/arm64/anames7.go10
-rw-r--r--src/cmd/internal/obj/arm64/asm7.go300
6 files changed, 290 insertions, 111 deletions
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index 7f495b90bb..b6c22e0d6f 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -454,6 +454,17 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
FMOVD F20, (R2) // 540000fd
FMOVD.P F20, 8(R1) // 348400fc
FMOVD.W 8(R1), F20 // 348c40fc
+ FMOVQ.P F13, 11(R10) // 4db5803c
+ FMOVQ.W F15, 11(R20) // 8fbe803c
+ FMOVQ.P 11(R10), F13 // 4db5c03c
+ FMOVQ.W 11(R20), F15 // 8fbec03c
+ FMOVQ F10, 65520(R10) // 4afdbf3d
+ FMOVQ F11, 64(RSP) // eb13803d
+ FMOVQ F11, 8(R20) // 8b82803c
+ FMOVQ F11, 4(R20) // 8b42803c
+ FMOVQ 32(R5), F2 // a208c03d
+ FMOVQ 65520(R10), F10 // 4afdff3d
+ FMOVQ 64(RSP), F11 // eb13c03d
PRFM (R2), PLDL1KEEP // 400080f9
PRFM 16(R2), PLDL1KEEP // 400880f9
PRFM 48(R6), PSTL2STRM // d31880f9
diff --git a/src/cmd/asm/internal/asm/testdata/arm64error.s b/src/cmd/asm/internal/asm/testdata/arm64error.s
index 20b1f3e9f0..c3a617066a 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64error.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64error.s
@@ -87,13 +87,13 @@ TEXT errors(SB),$0
VLD1.P 32(R1), [V8.S4, V9.S4, V10.S4] // ERROR "invalid post-increment offset"
VLD1.P 48(R1), [V7.S4, V8.S4, V9.S4, V10.S4] // ERROR "invalid post-increment offset"
VPMULL V1.D1, V2.H4, V3.Q1 // ERROR "invalid arrangement"
- VPMULL V1.H4, V2.H4, V3.Q1 // ERROR "invalid arrangement"
- VPMULL V1.D2, V2.D2, V3.Q1 // ERROR "invalid arrangement"
- VPMULL V1.B16, V2.B16, V3.H8 // ERROR "invalid arrangement"
+ VPMULL V1.H4, V2.H4, V3.Q1 // ERROR "operand mismatch"
+ VPMULL V1.D2, V2.D2, V3.Q1 // ERROR "operand mismatch"
+ VPMULL V1.B16, V2.B16, V3.H8 // ERROR "operand mismatch"
VPMULL2 V1.D2, V2.H4, V3.Q1 // ERROR "invalid arrangement"
- VPMULL2 V1.H4, V2.H4, V3.Q1 // ERROR "invalid arrangement"
- VPMULL2 V1.D1, V2.D1, V3.Q1 // ERROR "invalid arrangement"
- VPMULL2 V1.B8, V2.B8, V3.H8 // ERROR "invalid arrangement"
+ VPMULL2 V1.H4, V2.H4, V3.Q1 // ERROR "operand mismatch"
+ VPMULL2 V1.D1, V2.D1, V3.Q1 // ERROR "operand mismatch"
+ VPMULL2 V1.B8, V2.B8, V3.H8 // ERROR "operand mismatch"
VEXT $8, V1.B16, V2.B8, V2.B16 // ERROR "invalid arrangement"
VEXT $8, V1.H8, V2.H8, V2.H8 // ERROR "invalid arrangement"
VRBIT V1.B16, V2.B8 // ERROR "invalid arrangement"
@@ -353,4 +353,8 @@ TEXT errors(SB),$0
VUSHLL2 $32, V30.S4, V2.D2 // ERROR "shift amount out of range"
VBIF V0.B8, V1.B8, V2.B16 // ERROR "operand mismatch"
VBIF V0.D2, V1.D2, V2.D2 // ERROR "invalid arrangement"
+ VUADDW V9.B8, V12.H8, V14.B8 // ERROR "invalid arrangement"
+ VUADDW2 V9.B8, V12.S4, V14.S4 // ERROR "operand mismatch"
+ VSLI $64, V7.D2, V8.D2 // ERROR "shift out of range"
+ VUSRA $0, V7.D2, V8.D2 // ERROR "shift out of range"
RET
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 33319e48df..98504353e2 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -419,27 +419,33 @@ const (
C_SBRA // for TYPE_BRANCH
C_LBRA
- C_ZAUTO // 0(RSP)
- C_NSAUTO_8 // -256 <= x < 0, 0 mod 8
- C_NSAUTO_4 // -256 <= x < 0, 0 mod 4
- C_NSAUTO // -256 <= x < 0
- C_NPAUTO // -512 <= x < 0, 0 mod 8
- C_NAUTO4K // -4095 <= x < 0
- C_PSAUTO_8 // 0 to 255, 0 mod 8
- C_PSAUTO_4 // 0 to 255, 0 mod 4
- C_PSAUTO // 0 to 255
- C_PPAUTO // 0 to 504, 0 mod 8
- C_UAUTO4K_8 // 0 to 4095, 0 mod 8
- C_UAUTO4K_4 // 0 to 4095, 0 mod 4
- C_UAUTO4K_2 // 0 to 4095, 0 mod 2
- C_UAUTO4K // 0 to 4095
- C_UAUTO8K_8 // 0 to 8190, 0 mod 8
- C_UAUTO8K_4 // 0 to 8190, 0 mod 4
- C_UAUTO8K // 0 to 8190, 0 mod 2
- C_UAUTO16K_8 // 0 to 16380, 0 mod 8
- C_UAUTO16K // 0 to 16380, 0 mod 4
- C_UAUTO32K // 0 to 32760, 0 mod 8
- C_LAUTO // any other 32-bit constant
+ C_ZAUTO // 0(RSP)
+ C_NSAUTO_8 // -256 <= x < 0, 0 mod 8
+ C_NSAUTO_4 // -256 <= x < 0, 0 mod 4
+ C_NSAUTO // -256 <= x < 0
+ C_NPAUTO // -512 <= x < 0, 0 mod 8
+ C_NAUTO4K // -4095 <= x < 0
+ C_PSAUTO_8 // 0 to 255, 0 mod 8
+ C_PSAUTO_4 // 0 to 255, 0 mod 4
+ C_PSAUTO // 0 to 255
+ C_PPAUTO_16 // 0 to 504, 0 mod 16
+ C_PPAUTO // 0 to 504, 0 mod 8
+ C_UAUTO4K_16 // 0 to 4095, 0 mod 16
+ C_UAUTO4K_8 // 0 to 4095, 0 mod 8
+ C_UAUTO4K_4 // 0 to 4095, 0 mod 4
+ C_UAUTO4K_2 // 0 to 4095, 0 mod 2
+ C_UAUTO4K // 0 to 4095
+ C_UAUTO8K_16 // 0 to 8190, 0 mod 16
+ C_UAUTO8K_8 // 0 to 8190, 0 mod 8
+ C_UAUTO8K_4 // 0 to 8190, 0 mod 4
+ C_UAUTO8K // 0 to 8190, 0 mod 2 + C_PSAUTO
+ C_UAUTO16K_16 // 0 to 16380, 0 mod 16
+ C_UAUTO16K_8 // 0 to 16380, 0 mod 8
+ C_UAUTO16K // 0 to 16380, 0 mod 4 + C_PSAUTO
+ C_UAUTO32K_16 // 0 to 32760, 0 mod 16 + C_PSAUTO
+ C_UAUTO32K // 0 to 32760, 0 mod 8 + C_PSAUTO
+ C_UAUTO64K // 0 to 65520, 0 mod 16 + C_PSAUTO
+ C_LAUTO // any other 32-bit constant
C_SEXT1 // 0 to 4095, direct
C_SEXT2 // 0 to 8190
@@ -457,17 +463,23 @@ const (
C_PSOREG_8
C_PSOREG_4
C_PSOREG
+ C_PPOREG_16
C_PPOREG
+ C_UOREG4K_16
C_UOREG4K_8
C_UOREG4K_4
C_UOREG4K_2
C_UOREG4K
+ C_UOREG8K_16
C_UOREG8K_8
C_UOREG8K_4
C_UOREG8K
+ C_UOREG16K_16
C_UOREG16K_8
C_UOREG16K
+ C_UOREG32K_16
C_UOREG32K
+ C_UOREG64K
C_LOREG
C_ADDR // TODO(aram): explain difference from C_VCONADDR
@@ -873,6 +885,7 @@ const (
AFDIVS
AFLDPD
AFLDPS
+ AFMOVQ
AFMOVD
AFMOVS
AVMOVQ
@@ -1001,6 +1014,7 @@ const (
AVUZP2
AVSHL
AVSRI
+ AVSLI
AVBSL
AVBIT
AVTBL
@@ -1008,6 +1022,9 @@ const (
AVZIP1
AVZIP2
AVCMTST
+ AVUADDW2
+ AVUADDW
+ AVUSRA
ALAST
AB = obj.AJMP
ABL = obj.ACALL
diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go
index e5534e26b9..126eefd032 100644
--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -379,6 +379,7 @@ var Anames = []string{
"FDIVS",
"FLDPD",
"FLDPS",
+ "FMOVQ",
"FMOVD",
"FMOVS",
"VMOVQ",
@@ -507,6 +508,7 @@ var Anames = []string{
"VUZP2",
"VSHL",
"VSRI",
+ "VSLI",
"VBSL",
"VBIT",
"VTBL",
@@ -514,5 +516,8 @@ var Anames = []string{
"VZIP1",
"VZIP2",
"VCMTST",
+ "VUADDW2",
+ "VUADDW",
+ "VUSRA",
"LAST",
}
diff --git a/src/cmd/internal/obj/arm64/anames7.go b/src/cmd/internal/obj/arm64/anames7.go
index 96c9f788d9..f7e99517ce 100644
--- a/src/cmd/internal/obj/arm64/anames7.go
+++ b/src/cmd/internal/obj/arm64/anames7.go
@@ -51,16 +51,21 @@ var cnames7 = []string{
"PSAUTO_4",
"PSAUTO",
"PPAUTO",
+ "UAUTO4K_16",
"UAUTO4K_8",
"UAUTO4K_4",
"UAUTO4K_2",
"UAUTO4K",
+ "UAUTO8K_16",
"UAUTO8K_8",
"UAUTO8K_4",
"UAUTO8K",
+ "UAUTO16K_16",
"UAUTO16K_8",
"UAUTO16K",
+ "UAUTO32K_8",
"UAUTO32K",
+ "UAUTO64K",
"LAUTO",
"SEXT1",
"SEXT2",
@@ -78,16 +83,21 @@ var cnames7 = []string{
"PSOREG_4",
"PSOREG",
"PPOREG",
+ "UOREG4K_16",
"UOREG4K_8",
"UOREG4K_4",
"UOREG4K_2",
"UOREG4K",
+ "UOREG8K_16",
"UOREG8K_8",
"UOREG8K_4",
"UOREG8K",
+ "UOREG16K_16",
"UOREG16K_8",
"UOREG16K",
+ "UOREG32K_16",
"UOREG32K",
+ "UOREG64K",
"LOREG",
"ADDR",
"GOTADDR",
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 92c5729d25..8cbf5e719f 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -483,6 +483,7 @@ var optab = []Optab{
{AVZIP1, C_ARNG, C_ARNG, C_NONE, C_ARNG, 72, 4, 0, 0, 0},
{AVUSHLL, C_VCON, C_ARNG, C_NONE, C_ARNG, 102, 4, 0, 0, 0},
{AVUXTL, C_ARNG, C_NONE, C_NONE, C_ARNG, 102, 4, 0, 0, 0},
+ {AVUADDW, C_ARNG, C_ARNG, C_NONE, C_ARNG, 105, 4, 0, 0, 0},
/* conditional operations */
{ACSEL, C_COND, C_REG, C_REG, C_REG, 18, 4, 0, 0, 0},
@@ -509,6 +510,8 @@ var optab = []Optab{
{AFMOVS, C_FREG, C_NONE, C_NONE, C_UOREG16K, 20, 4, 0, 0, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_UAUTO32K, 20, 4, REGSP, 0, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_UOREG32K, 20, 4, 0, 0, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_UAUTO64K, 20, 4, REGSP, 0, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_UOREG64K, 20, 4, 0, 0, 0},
/* unscaled 9-bit signed displacement store */
{AMOVB, C_REG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
@@ -526,6 +529,8 @@ var optab = []Optab{
{AFMOVS, C_FREG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
/* scaled 12-bit unsigned displacement load */
{AMOVB, C_UAUTO4K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
@@ -543,6 +548,8 @@ var optab = []Optab{
{AFMOVS, C_UOREG16K, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
{AFMOVD, C_UAUTO32K, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
{AFMOVD, C_UOREG32K, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
+ {AFMOVQ, C_UAUTO64K, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
+ {AFMOVQ, C_UOREG64K, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
/* unscaled 9-bit signed displacement load */
{AMOVB, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
@@ -560,6 +567,8 @@ var optab = []Optab{
{AFMOVS, C_NSOREG, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
{AFMOVD, C_NSAUTO, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
{AFMOVD, C_NSOREG, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
+ {AFMOVQ, C_NSAUTO, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
+ {AFMOVQ, C_NSOREG, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
/* long displacement store */
{AMOVB, C_REG, C_NONE, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0},
@@ -577,6 +586,8 @@ var optab = []Optab{
{AFMOVS, C_FREG, C_NONE, C_NONE, C_LOREG, 30, 8, 0, LTO, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_LOREG, 30, 8, 0, LTO, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_LAUTO, 30, 8, REGSP, LTO, 0},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_LOREG, 30, 8, 0, LTO, 0},
/* long displacement load */
{AMOVB, C_LAUTO, C_NONE, C_NONE, C_REG, 31, 8, REGSP, LFROM, 0},
@@ -594,6 +605,8 @@ var optab = []Optab{
{AFMOVS, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
{AFMOVD, C_LAUTO, C_NONE, C_NONE, C_FREG, 31, 8, REGSP, LFROM, 0},
{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
+ {AFMOVQ, C_LAUTO, C_NONE, C_NONE, C_FREG, 31, 8, REGSP, LFROM, 0},
+ {AFMOVQ, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
/* pre/post-indexed load (unscaled, signed 9-bit offset) */
{AMOVD, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
@@ -603,6 +616,7 @@ var optab = []Optab{
{AMOVBU, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
{AFMOVS, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPOST},
{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPOST},
+ {AFMOVQ, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPOST},
{AMOVD, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPRE},
{AMOVW, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPRE},
@@ -611,6 +625,7 @@ var optab = []Optab{
{AMOVBU, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPRE},
{AFMOVS, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPRE},
{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPRE},
+ {AFMOVQ, C_LOREG, C_NONE, C_NONE, C_FREG, 22, 4, 0, 0, C_XPRE},
/* pre/post-indexed store (unscaled, signed 9-bit offset) */
{AMOVD, C_REG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPOST},
@@ -620,6 +635,7 @@ var optab = []Optab{
{AMOVBU, C_REG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPOST},
{AFMOVS, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPOST},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPOST},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPOST},
{AMOVD, C_REG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
{AMOVW, C_REG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
@@ -628,6 +644,7 @@ var optab = []Optab{
{AMOVBU, C_REG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
{AFMOVS, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
{AFMOVD, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
+ {AFMOVQ, C_FREG, C_NONE, C_NONE, C_LOREG, 23, 4, 0, 0, C_XPRE},
/* load with shifted or extended register offset */
{AMOVD, C_ROFF, C_NONE, C_NONE, C_REG, 98, 4, 0, 0, 0},
@@ -1207,37 +1224,49 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) {
C_PSAUTO,
C_PSAUTO_8,
C_PSAUTO_4,
+ C_PPAUTO_16,
C_PPAUTO,
+ C_UAUTO4K_16,
C_UAUTO4K_8,
C_UAUTO4K_4,
C_UAUTO4K_2,
C_UAUTO4K,
+ C_UAUTO8K_16,
C_UAUTO8K_8,
C_UAUTO8K_4,
C_UAUTO8K,
+ C_UAUTO16K_16,
C_UAUTO16K_8,
C_UAUTO16K,
+ C_UAUTO32K_16,
C_UAUTO32K,
+ C_UAUTO64K,
C_NSAUTO_8,
C_NSAUTO_4,
C_NSAUTO,
C_NPAUTO,
C_NAUTO4K,
C_LAUTO,
- C_PPOREG,
C_PSOREG,
- C_PSOREG_4,
C_PSOREG_8,
+ C_PSOREG_4,
+ C_PPOREG_16,
+ C_PPOREG,
+ C_UOREG4K_16,
C_UOREG4K_8,
C_UOREG4K_4,
C_UOREG4K_2,
C_UOREG4K,
+ C_UOREG8K_16,
C_UOREG8K_8,
C_UOREG8K_4,
C_UOREG8K,
+ C_UOREG16K_16,
C_UOREG16K_8,
C_UOREG16K,
+ C_UOREG32K_16,
C_UOREG32K,
+ C_UOREG64K,
C_NSOREG_8,
C_NSOREG_4,
C_NSOREG,
@@ -1532,10 +1561,18 @@ func autoclass(l int64) int {
}
return C_PSAUTO
}
- if l <= 504 && l&7 == 0 {
- return C_PPAUTO
+ if l <= 504 {
+ if l&15 == 0 {
+ return C_PPAUTO_16
+ }
+ if l&7 == 0 {
+ return C_PPAUTO
+ }
}
if l <= 4095 {
+ if l&15 == 0 {
+ return C_UAUTO4K_16
+ }
if l&7 == 0 {
return C_UAUTO4K_8
}
@@ -1548,6 +1585,9 @@ func autoclass(l int64) int {
return C_UAUTO4K
}
if l <= 8190 {
+ if l&15 == 0 {
+ return C_UAUTO8K_16
+ }
if l&7 == 0 {
return C_UAUTO8K_8
}
@@ -1559,6 +1599,9 @@ func autoclass(l int64) int {
}
}
if l <= 16380 {
+ if l&15 == 0 {
+ return C_UAUTO16K_16
+ }
if l&7 == 0 {
return C_UAUTO16K_8
}
@@ -1566,8 +1609,16 @@ func autoclass(l int64) int {
return C_UAUTO16K
}
}
- if l <= 32760 && (l&7) == 0 {
- return C_UAUTO32K
+ if l <= 32760 {
+ if l&15 == 0 {
+ return C_UAUTO32K_16
+ }
+ if l&7 == 0 {
+ return C_UAUTO32K
+ }
+ }
+ if l <= 65520 && (l&15) == 0 {
+ return C_UAUTO64K
}
return C_LAUTO
}
@@ -1595,6 +1646,8 @@ func (c *ctxt7) offsetshift(p *obj.Prog, v int64, cls int) int64 {
s = 2
case C_UAUTO32K, C_UOREG32K:
s = 3
+ case C_UAUTO64K, C_UOREG64K:
+ s = 4
default:
c.ctxt.Diag("bad class: %v\n%v", DRconv(cls), p)
}
@@ -2126,46 +2179,66 @@ func cmp(a int, b int) bool {
case C_PPAUTO:
switch b {
- case C_ZAUTO, C_PSAUTO_8:
+ case C_ZAUTO, C_PSAUTO_8, C_PPAUTO_16:
return true
}
case C_UAUTO4K:
switch b {
case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
- C_PPAUTO, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8:
+ C_PPAUTO, C_PPAUTO_16,
+ C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO4K_16:
return true
}
case C_UAUTO8K:
switch b {
- case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8, C_PPAUTO,
- C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8:
+ case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
+ C_PPAUTO, C_PPAUTO_16,
+ C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO4K_16,
+ C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO8K_16:
return true
}
case C_UAUTO16K:
switch b {
- case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8, C_PPAUTO,
- C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO16K_8:
+ case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
+ C_PPAUTO, C_PPAUTO_16,
+ C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO4K_16,
+ C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO8K_16,
+ C_UAUTO16K_8, C_UAUTO16K_16:
return true
}
case C_UAUTO32K:
switch b {
case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
- C_PPAUTO, C_UAUTO4K_8, C_UAUTO8K_8, C_UAUTO16K_8:
+ C_PPAUTO, C_PPAUTO_16,
+ C_UAUTO4K_8, C_UAUTO4K_16,
+ C_UAUTO8K_8, C_UAUTO8K_16,
+ C_UAUTO16K_8, C_UAUTO16K_16,
+ C_UAUTO32K_16:
+ return true
+ }
+
+ case C_UAUTO64K:
+ switch b {
+ case C_ZAUTO, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
+ C_PPAUTO_16, C_UAUTO4K_16, C_UAUTO8K_16, C_UAUTO16K_16,
+ C_UAUTO32K_16:
return true
}
case C_LAUTO:
switch b {
- case C_ZAUTO, C_NSAUTO, C_NSAUTO_4, C_NSAUTO_8, C_NPAUTO,
- C_NAUTO4K, C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8, C_PPAUTO,
- C_UAUTO4K, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8,
- C_UAUTO8K, C_UAUTO8K_4, C_UAUTO8K_8,
- C_UAUTO16K, C_UAUTO16K_8,
- C_UAUTO32K:
+ case C_ZAUTO, C_NSAUTO, C_NSAUTO_4, C_NSAUTO_8, C_NPAUTO, C_NAUTO4K,
+ C_PSAUTO, C_PSAUTO_4, C_PSAUTO_8,
+ C_PPAUTO, C_PPAUTO_16,
+ C_UAUTO4K, C_UAUTO4K_2, C_UAUTO4K_4, C_UAUTO4K_8, C_UAUTO4K_16,
+ C_UAUTO8K, C_UAUTO8K_4, C_UAUTO8K_8, C_UAUTO8K_16,
+ C_UAUTO16K, C_UAUTO16K_8, C_UAUTO16K_16,
+ C_UAUTO32K, C_UAUTO32K_16,
+ C_UAUTO64K:
return true
}
@@ -2192,6 +2265,11 @@ func cmp(a int, b int) bool {
return true
}
+ case C_PSOREG_8:
+ if b == C_ZOREG {
+ return true
+ }
+
case C_PSOREG_4:
switch b {
case C_ZOREG, C_PSOREG_8:
@@ -2206,48 +2284,66 @@ func cmp(a int, b int) bool {
case C_PPOREG:
switch b {
- case C_ZOREG, C_PSOREG_8:
+ case C_ZOREG, C_PSOREG_8, C_PPOREG_16:
return true
}
case C_UOREG4K:
switch b {
- case C_ZOREG, C_PSOREG_4, C_PSOREG_8, C_PSOREG,
- C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8:
+ case C_ZOREG, C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG, C_PPOREG_16,
+ C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG4K_16:
return true
}
case C_UOREG8K:
switch b {
- case C_ZOREG, C_PSOREG_4, C_PSOREG_8, C_PSOREG,
- C_PPOREG, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8,
- C_UOREG8K_4, C_UOREG8K_8:
+ case C_ZOREG, C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG, C_PPOREG_16,
+ C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG4K_16,
+ C_UOREG8K_4, C_UOREG8K_8, C_UOREG8K_16:
return true
}
case C_UOREG16K:
switch b {
- case C_ZOREG, C_PSOREG_4, C_PSOREG_8, C_PSOREG,
- C_PPOREG, C_UOREG4K_4, C_UOREG4K_8, C_UOREG8K_4,
- C_UOREG8K_8, C_UOREG16K_8:
+ case C_ZOREG, C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG, C_PPOREG_16,
+ C_UOREG4K_4, C_UOREG4K_8, C_UOREG4K_16,
+ C_UOREG8K_4, C_UOREG8K_8, C_UOREG8K_16,
+ C_UOREG16K_8, C_UOREG16K_16:
return true
}
case C_UOREG32K:
switch b {
- case C_ZOREG, C_PSOREG_4, C_PSOREG_8, C_PSOREG,
- C_PPOREG, C_UOREG4K_8, C_UOREG8K_8, C_UOREG16K_8:
+ case C_ZOREG, C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG, C_PPOREG_16,
+ C_UOREG4K_8, C_UOREG4K_16,
+ C_UOREG8K_8, C_UOREG8K_16,
+ C_UOREG16K_8, C_UOREG16K_16,
+ C_UOREG32K_16:
+ return true
+ }
+
+ case C_UOREG64K:
+ switch b {
+ case C_ZOREG, C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG_16, C_UOREG4K_16, C_UOREG8K_16, C_UOREG16K_16,
+ C_UOREG32K_16:
return true
}
case C_LOREG:
switch b {
- case C_ZOREG, C_NSOREG, C_NSOREG_4, C_NSOREG_8, C_NPOREG,
- C_NOREG4K, C_PSOREG_4, C_PSOREG_8, C_PSOREG, C_PPOREG,
- C_UOREG4K, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8,
- C_UOREG8K, C_UOREG8K_4, C_UOREG8K_8,
- C_UOREG16K, C_UOREG16K_8,
- C_UOREG32K:
+ case C_ZOREG, C_NSOREG, C_NSOREG_4, C_NSOREG_8, C_NPOREG, C_NOREG4K,
+ C_PSOREG, C_PSOREG_4, C_PSOREG_8,
+ C_PPOREG, C_PPOREG_16,
+ C_UOREG4K, C_UOREG4K_2, C_UOREG4K_4, C_UOREG4K_8, C_UOREG4K_16,
+ C_UOREG8K, C_UOREG8K_4, C_UOREG8K_8, C_UOREG8K_16,
+ C_UOREG16K, C_UOREG16K_8, C_UOREG16K_16,
+ C_UOREG32K, C_UOREG32K_16,
+ C_UOREG64K:
return true
}
@@ -2675,7 +2771,8 @@ func buildop(ctxt *obj.Link) {
case AFCSELD:
oprangeset(AFCSELS, t)
- case AFMOVS, AFMOVD, AVMOVQ, AVMOVD, AVMOVS:
+ case AFMOVQ, AFMOVD, AFMOVS,
+ AVMOVQ, AVMOVD, AVMOVS:
break
case AFCVTZSD:
@@ -2802,6 +2899,8 @@ func buildop(ctxt *obj.Link) {
case AVUSHR:
oprangeset(AVSHL, t)
oprangeset(AVSRI, t)
+ oprangeset(AVSLI, t)
+ oprangeset(AVUSRA, t)
case AVREV32:
oprangeset(AVCNT, t)
@@ -2829,6 +2928,9 @@ func buildop(ctxt *obj.Link) {
case AVEOR3:
oprangeset(AVBCAX, t)
+ case AVUADDW:
+ oprangeset(AVUADDW2, t)
+
case ASHA1H,
AVCNT,
AVMOV,
@@ -4805,47 +4907,31 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
o1 |= (uint32(imm5&0x1f) << 16) | (uint32(imm4&0xf) << 11) | (uint32(rf&31) << 5) | uint32(rt&31)
- case 93: /* vpmull{2} Vm.<T>, Vn.<T>, Vd */
- af := int((p.From.Reg >> 5) & 15)
- at := int((p.To.Reg >> 5) & 15)
- a := int((p.Reg >> 5) & 15)
+ case 93: /* vpmull{2} Vm.<Tb>, Vn.<Tb>, Vd.<Ta> */
+ af := uint8((p.From.Reg >> 5) & 15)
+ at := uint8((p.To.Reg >> 5) & 15)
+ a := uint8((p.Reg >> 5) & 15)
+ if af != a {
+ c.ctxt.Diag("invalid arrangement: %v", p)
+ }
var Q, size uint32
- if p.As == AVPMULL {
- Q = 0
- } else {
+ if p.As == AVPMULL2 {
Q = 1
}
-
- var fArng int
- switch at {
- case ARNG_8H:
- if Q == 0 {
- fArng = ARNG_8B
- } else {
- fArng = ARNG_16B
- }
+ switch pack(Q, at, af) {
+ case pack(0, ARNG_8H, ARNG_8B), pack(1, ARNG_8H, ARNG_16B):
size = 0
- case ARNG_1Q:
- if Q == 0 {
- fArng = ARNG_1D
- } else {
- fArng = ARNG_2D
- }
+ case pack(0, ARNG_1Q, ARNG_1D), pack(1, ARNG_1Q, ARNG_2D):
size = 3
default:
- c.ctxt.Diag("invalid arrangement on Vd.<T>: %v", p)
- }
-
- if af != a || af != fArng {
- c.ctxt.Diag("invalid arrangement: %v", p)
+ c.ctxt.Diag("operand mismatch: %v\n", p)
}
o1 = c.oprrr(p, p.As)
rf := int((p.From.Reg) & 31)
rt := int((p.To.Reg) & 31)
r := int((p.Reg) & 31)
-
o1 |= ((Q & 1) << 30) | ((size & 3) << 22) | (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31)
case 94: /* vext $imm4, Vm.<T>, Vn.<T>, Vd.<T> */
@@ -4883,7 +4969,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= ((Q & 1) << 30) | (uint32(r&31) << 16) | (uint32(index&15) << 11) | (uint32(rf&31) << 5) | uint32(rt&31)
- case 95: /* vushr $shift, Vn.<T>, Vd.<T> */
+ case 95: /* vushr/vshl/vsri/vsli/vusra $shift, Vn.<T>, Vd.<T> */
at := int((p.To.Reg >> 5) & 15)
af := int((p.Reg >> 5) & 15)
shift := int(p.From.Offset)
@@ -4920,14 +5006,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
imm := 0
-
switch p.As {
- case AVUSHR, AVSRI:
+ case AVUSHR, AVSRI, AVUSRA:
imm = esize*2 - shift
if imm < esize || imm > imax {
c.ctxt.Diag("shift out of range: %v", p)
}
- case AVSHL:
+ case AVSHL, AVSLI:
imm = esize + shift
if imm > imax {
c.ctxt.Diag("shift out of range: %v", p)
@@ -4940,7 +5025,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
rt := int((p.To.Reg) & 31)
rf := int((p.Reg) & 31)
- o1 |= ((Q & 1) << 30) | (uint32(imm&127) << 16) | (uint32(rf&31) << 5) | uint32(rt&31)
+ o1 |= ((Q & 1) << 30) | (uint32(imm&0x7f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31)
case 96: /* vst1 Vt1.<T>[index], offset(Rn) */
af := int((p.From.Reg >> 5) & 15)
@@ -5169,11 +5254,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
shift = 0
}
- pack := func(q, x, y uint8) uint32 {
- return uint32(q)<<16 | uint32(x)<<8 | uint32(y)
- }
-
- var Q uint8 = uint8(o1>>30) & 1
+ Q := (o1 >> 30) & 1
var immh, width uint8
switch pack(Q, af, at) {
case pack(0, ARNG_8B, ARNG_8H):
@@ -5195,6 +5276,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
c.ctxt.Diag("shift amount out of range: %v\n", p)
}
o1 |= uint32(immh)<<19 | uint32(shift)<<16 | uint32(rf&31)<<5 | uint32(p.To.Reg&31)
+
case 103: /* VEOR3/VBCAX Va.B16, Vm.B16, Vn.B16, Vd.B16 */
ta := (p.From.Reg >> 5) & 15
tm := (p.Reg >> 5) & 15
@@ -5240,6 +5322,35 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= (uint32(r&31) << 16) | (uint32(index&63) << 10) | (uint32(rf&31) << 5) | uint32(rt&31)
+ case 105: /* vuaddw{2} Vm.<Tb>, Vn.<Ta>, Vd.<Ta> */
+ af := uint8((p.From.Reg >> 5) & 15)
+ at := uint8((p.To.Reg >> 5) & 15)
+ a := uint8((p.Reg >> 5) & 15)
+ if at != a {
+ c.ctxt.Diag("invalid arrangement: %v", p)
+ break
+ }
+
+ var Q, size uint32
+ if p.As == AVUADDW2 {
+ Q = 1
+ }
+ switch pack(Q, at, af) {
+ case pack(0, ARNG_8H, ARNG_8B), pack(1, ARNG_8H, ARNG_16B):
+ size = 0
+ case pack(0, ARNG_4S, ARNG_4H), pack(1, ARNG_4S, ARNG_8H):
+ size = 1
+ case pack(0, ARNG_2D, ARNG_2S), pack(1, ARNG_2D, ARNG_4S):
+ size = 2
+ default:
+ c.ctxt.Diag("operand mismatch: %v\n", p)
+ }
+
+ o1 = c.oprrr(p, p.As)
+ rf := int((p.From.Reg) & 31)
+ rt := int((p.To.Reg) & 31)
+ r := int((p.Reg) & 31)
+ o1 |= ((Q & 1) << 30) | ((size & 3) << 22) | (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31)
}
out[0] = o1
out[1] = o2
@@ -5898,6 +6009,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 {
case AVUZP2:
return 7<<25 | 1<<14 | 3<<11
+
+ case AVUADDW, AVUADDW2:
+ return 0x17<<25 | 1<<21 | 1<<12
}
c.ctxt.Diag("%v: bad rrr %d %v", p, a, a)
@@ -6097,6 +6211,9 @@ func (c *ctxt7) opirr(p *obj.Prog, a obj.As) uint32 {
case AVSRI:
return 0x5E<<23 | 17<<10
+ case AVSLI:
+ return 0x5E<<23 | 21<<10
+
case AVUSHLL, AVUXTL:
return 1<<29 | 15<<24 | 0x29<<10
@@ -6106,6 +6223,9 @@ func (c *ctxt7) opirr(p *obj.Prog, a obj.As) uint32 {
case AVXAR:
return 0xCE<<24 | 1<<23
+ case AVUSRA:
+ return 1<<29 | 15<<24 | 5<<10
+
case APRFM:
return 0xf9<<24 | 2<<22
}
@@ -6535,7 +6655,14 @@ func (c *ctxt7) olsr9s(p *obj.Prog, o int32, v int32, b int, r int) uint32 {
// pre/post-indexed store.
// and the 12-bit and 9-bit are distinguished in olsr12u and oslr9s.
func (c *ctxt7) opstr(p *obj.Prog, a obj.As) uint32 {
- return LD2STR(c.opldr(p, a))
+ enc := c.opldr(p, a)
+ switch p.As {
+ case AFMOVQ:
+ enc = enc &^ (1 << 22)
+ default:
+ enc = LD2STR(enc)
+ }
+ return enc
}
// load(immediate)
@@ -6571,6 +6698,9 @@ func (c *ctxt7) opldr(p *obj.Prog, a obj.As) uint32 {
case AFMOVD:
return LDSTR(3, 1, 1)
+
+ case AFMOVQ:
+ return LDSTR(0, 1, 3)
}
c.ctxt.Diag("bad opldr %v\n%v", a, p)
@@ -7063,10 +7193,13 @@ func (c *ctxt7) maskOpvldvst(p *obj.Prog, o1 uint32) uint32 {
*/
func movesize(a obj.As) int {
switch a {
- case AMOVD:
+ case AFMOVQ:
+ return 4
+
+ case AMOVD, AFMOVD:
return 3
- case AMOVW, AMOVWU:
+ case AMOVW, AMOVWU, AFMOVS:
return 2
case AMOVH, AMOVHU:
@@ -7075,12 +7208,6 @@ func movesize(a obj.As) int {
case AMOVB, AMOVBU:
return 0
- case AFMOVS:
- return 2
-
- case AFMOVD:
- return 3
-
default:
return -1
}
@@ -7145,3 +7272,8 @@ func (c *ctxt7) encRegShiftOrExt(a *obj.Addr, r int16) uint32 {
return 0
}
+
+// pack returns the encoding of the "Q" field and two arrangement specifiers.
+func pack(q uint32, arngA, arngB uint8) uint32 {
+ return uint32(q)<<16 | uint32(arngA)<<8 | uint32(arngB)
+}