aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/internal/obj
diff options
context:
space:
mode:
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>2025-08-29 16:20:16 +0800
committerGopher Robot <gobot@golang.org>2025-09-04 09:22:33 -0700
commitb8cc907425c4b851d2b941cf689cf8177ea8a153 (patch)
treec6d99ae0cff79fbfa55dcaa69928a1c24ffc474a /src/cmd/internal/obj
parent8c27a808905b0611b0a7b7bbff08819206be3b86 (diff)
downloadgo-b8cc907425c4b851d2b941cf689cf8177ea8a153.tar.xz
cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL.{B/H/W/D}
The previously defined usage of offset was ambiguous and not easy to understand.

For example, to fetch 4 bytes of data from the address base+8 and broadcast it to each word element of vector register V5, the assembly implementation is as follows:
    previous: VMOVQ 2(base), V5.W4
    current:  VMOVQ 8(base), V5.W4

Change-Id: I8bc84e35033ab63bd10f4c61618789f94314f78c
Reviewed-on: https://go-review.googlesource.com/c/go/+/699875
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/cmd/internal/obj')
-rw-r--r--src/cmd/internal/obj/loong64/asm.go35
-rw-r--r--src/cmd/internal/obj/loong64/doc.go9
2 files changed, 43 insertions, 1 deletions
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 1b982f6c86..35b33b9376 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
}
+func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+ return op | (i&0x7FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+ return op | (i&0x3FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+ return op | (i&0x1FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
}
@@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
si := c.regoff(&p.From)
Rj := uint32(p.From.Reg & EXT_REG_MASK)
Vd := uint32(p.To.Reg & EXT_REG_MASK)
- o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+ switch v & 0xc00000 {
+ case 0x800000: // [x]vldrepl.b
+ o1 = OP_12IRR(v, uint32(si), Rj, Vd)
+ case 0x400000: // [x]vldrepl.h
+ if si&1 != 0 {
+ c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
+ }
+ o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
+ case 0x0:
+ switch v & 0x300000 {
+ case 0x200000: // [x]vldrepl.w
+ if si&3 != 0 {
+ c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
+ }
+ o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
+ case 0x100000: // [x]vldrepl.d
+ if si&7 != 0 {
+ c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
+ }
+ o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
+ }
+ }
case 47: // preld offset(Rbase), $hint
offs := c.regoff(&p.From)
diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go
index 6c8f2618a2..20c5a9e0a6 100644
--- a/src/cmd/internal/obj/loong64/doc.go
+++ b/src/cmd/internal/obj/loong64/doc.go
@@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
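+ note: In Go assembly, for ease of understanding, offset represents the actual address offset in bytes.
+ However, during platform encoding, the offset (which must be a multiple of the element size) is shifted right by 0, 1, 2 or 3 bits for byte, half-word, word and double-word elements respectively, to increase the encodable offset range, as follows: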
+
+ Go assembly | platform assembly
+ VMOVQ 1(R4), V5.B16 | vldrepl.b v5, r4, $1
+ VMOVQ 2(R4), V5.H8 | vldrepl.h v5, r4, $1
+ VMOVQ 8(R4), V5.W4 | vldrepl.w v5, r4, $2
+ VMOVQ 8(R4), V5.V2 | vldrepl.d v5, r4, $1
+
# Special instruction encoding definition and description on LoongArch
1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased