aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/internal/obj
diff options
context:
space:
mode:
authorJoel Sing <joel@sing.id.au>2025-09-26 05:05:49 +1000
committerJoel Sing <joel@sing.id.au>2025-11-18 00:58:00 -0800
commit9859b436430aac382b337964a1b380bc4bfcda70 (patch)
tree62dcd5c1a80b6fc8946c7ed58ba8186bc5deaf53 /src/cmd/internal/obj
parentb9ef0633f6117c74fabcd7247a76b4feb86df086 (diff)
downloadgo-9859b436430aac382b337964a1b380bc4bfcda70.tar.xz
cmd/asm,cmd/compile,cmd/internal/obj/riscv: use compressed instructions on riscv64
Make use of compressed instructions on riscv64 - add a compress pass to the end of the assembler, which replaces non-compressed instructions with compressed alternatives if possible. Provide a `compressinstructions` compiler and assembler debug flag, such that the compression pass can be disabled via `-asmflags=all=-d=compressinstructions=0` and `-gcflags=all=-d=compressinstructions=0`. Note that this does not prevent the explicit use of compressed instructions via assembly. Note that this does not make use of compressed control transfer instructions - this will be implemented in later changes. Reduces the text size of a hello world binary by ~121KB and reduces the text size of the go binary on riscv64 by ~1.21MB (between 8-10% in both cases). Updates #71105 Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 Change-Id: I24258353688554042c2a836deed4830cc673e985 Reviewed-on: https://go-review.googlesource.com/c/go/+/523478 Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Mark Freeman <markfreeman@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com>
Diffstat (limited to 'src/cmd/internal/obj')
-rw-r--r--src/cmd/internal/obj/link.go61
-rw-r--r--src/cmd/internal/obj/riscv/asm_test.go16
-rw-r--r--src/cmd/internal/obj/riscv/cpu.go3
-rw-r--r--src/cmd/internal/obj/riscv/obj.go178
4 files changed, 214 insertions, 44 deletions
diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go
index 85dca33d27..c70c1d9438 100644
--- a/src/cmd/internal/obj/link.go
+++ b/src/cmd/internal/obj/link.go
@@ -1153,36 +1153,37 @@ type Func interface {
// Link holds the context for writing object code from a compiler
// to be linker input or for reading that input into the linker.
type Link struct {
- Headtype objabi.HeadType
- Arch *LinkArch
- Debugasm int
- Debugvlog bool
- Debugpcln string
- Flag_shared bool
- Flag_dynlink bool
- Flag_linkshared bool
- Flag_optimize bool
- Flag_locationlists bool
- Flag_noRefName bool // do not include referenced symbol names in object file
- Retpoline bool // emit use of retpoline stubs for indirect jmp/call
- Flag_maymorestack string // If not "", call this function before stack checks
- Bso *bufio.Writer
- Pathname string
- Pkgpath string // the current package's import path
- hashmu sync.Mutex // protects hash, funchash
- hash map[string]*LSym // name -> sym mapping
- funchash map[string]*LSym // name -> sym mapping for ABIInternal syms
- statichash map[string]*LSym // name -> sym mapping for static syms
- PosTable src.PosTable
- InlTree InlTree // global inlining tree used by gc/inl.go
- DwFixups *DwarfFixupTable
- DwTextCount int
- Imports []goobj.ImportedPkg
- DiagFunc func(string, ...any)
- DiagFlush func()
- DebugInfo func(ctxt *Link, fn *LSym, info *LSym, curfn Func) ([]dwarf.Scope, dwarf.InlCalls)
- GenAbstractFunc func(fn *LSym)
- Errors int
+ Headtype objabi.HeadType
+ Arch *LinkArch
+ CompressInstructions bool // use compressed instructions where possible (if supported by architecture)
+ Debugasm int
+ Debugvlog bool
+ Debugpcln string
+ Flag_shared bool
+ Flag_dynlink bool
+ Flag_linkshared bool
+ Flag_optimize bool
+ Flag_locationlists bool
+ Flag_noRefName bool // do not include referenced symbol names in object file
+ Retpoline bool // emit use of retpoline stubs for indirect jmp/call
+ Flag_maymorestack string // If not "", call this function before stack checks
+ Bso *bufio.Writer
+ Pathname string
+ Pkgpath string // the current package's import path
+ hashmu sync.Mutex // protects hash, funchash
+ hash map[string]*LSym // name -> sym mapping
+ funchash map[string]*LSym // name -> sym mapping for ABIInternal syms
+ statichash map[string]*LSym // name -> sym mapping for static syms
+ PosTable src.PosTable
+ InlTree InlTree // global inlining tree used by gc/inl.go
+ DwFixups *DwarfFixupTable
+ DwTextCount int
+ Imports []goobj.ImportedPkg
+ DiagFunc func(string, ...any)
+ DiagFlush func()
+ DebugInfo func(ctxt *Link, fn *LSym, info *LSym, curfn Func) ([]dwarf.Scope, dwarf.InlCalls)
+ GenAbstractFunc func(fn *LSym)
+ Errors int
InParallel bool // parallel backend phase in effect
UseBASEntries bool // use Base Address Selection Entries in location lists and PC ranges
diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go
index f40e57fa64..5b50d1533a 100644
--- a/src/cmd/internal/obj/riscv/asm_test.go
+++ b/src/cmd/internal/obj/riscv/asm_test.go
@@ -11,8 +11,8 @@ import (
"os"
"os/exec"
"path/filepath"
+ "regexp"
"runtime"
- "strings"
"testing"
)
@@ -48,10 +48,10 @@ func genLargeBranch(buf *bytes.Buffer) {
fmt.Fprintln(buf, "TEXT f(SB),0,$0-0")
fmt.Fprintln(buf, "BEQ X0, X0, label")
for i := 0; i < 1<<19; i++ {
- fmt.Fprintln(buf, "ADD $0, X0, X0")
+ fmt.Fprintln(buf, "ADD $0, X5, X0")
}
fmt.Fprintln(buf, "label:")
- fmt.Fprintln(buf, "ADD $0, X0, X0")
+ fmt.Fprintln(buf, "ADD $0, X5, X0")
}
// TestLargeCall generates a large function (>1MB of text) with a call to
@@ -112,11 +112,11 @@ func genLargeCall(buf *bytes.Buffer) {
fmt.Fprintln(buf, "TEXT ·x(SB),0,$0-0")
fmt.Fprintln(buf, "CALL ·y(SB)")
for i := 0; i < 1<<19; i++ {
- fmt.Fprintln(buf, "ADD $0, X0, X0")
+ fmt.Fprintln(buf, "ADD $0, X5, X0")
}
fmt.Fprintln(buf, "RET")
fmt.Fprintln(buf, "TEXT ·y(SB),0,$0-0")
- fmt.Fprintln(buf, "ADD $0, X0, X0")
+ fmt.Fprintln(buf, "ADD $0, X5, X0")
fmt.Fprintln(buf, "RET")
}
@@ -301,9 +301,9 @@ TEXT _stub(SB),$0-0
// FENCE
// NOP
// FENCE
- // RET
- want := "0f 00 f0 0f 13 00 00 00 0f 00 f0 0f 67 80 00 00"
- if !strings.Contains(string(out), want) {
+ // RET (CJALR or JALR)
+ want := regexp.MustCompile("0x0000 0f 00 f0 0f 13 00 00 00 0f 00 f0 0f (82 80|67 80 00 00) ")
+ if !want.Match(out) {
t.Errorf("PCALIGN test failed - got %s\nwant %s", out, want)
}
}
diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
index 60174a0b3a..a91395dd38 100644
--- a/src/cmd/internal/obj/riscv/cpu.go
+++ b/src/cmd/internal/obj/riscv/cpu.go
@@ -326,6 +326,9 @@ const (
NEED_GOT_PCREL_ITYPE_RELOC
)
+const NEED_RELOC = NEED_JAL_RELOC | NEED_CALL_RELOC | NEED_PCREL_ITYPE_RELOC |
+ NEED_PCREL_STYPE_RELOC | NEED_GOT_PCREL_ITYPE_RELOC
+
// RISC-V mnemonics, as defined in the "opcodes" and "opcodes-pseudo" files
// at https://github.com/riscv/riscv-opcodes.
//
diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
index 8b9be5d78b..043be17c07 100644
--- a/src/cmd/internal/obj/riscv/obj.go
+++ b/src/cmd/internal/obj/riscv/obj.go
@@ -414,10 +414,10 @@ func containsCall(sym *obj.LSym) bool {
// setPCs sets the Pc field in all instructions reachable from p.
// It uses pc as the initial value and returns the next available pc.
-func setPCs(p *obj.Prog, pc int64) int64 {
+func setPCs(p *obj.Prog, pc int64, compress bool) int64 {
for ; p != nil; p = p.Link {
p.Pc = pc
- for _, ins := range instructionsForProg(p) {
+ for _, ins := range instructionsForProg(p, compress) {
pc += int64(ins.length())
}
@@ -671,7 +671,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
// a fixed point will be reached). No attempt to handle functions > 2GiB.
for {
big, rescan := false, false
- maxPC := setPCs(cursym.Func().Text, 0)
+ maxPC := setPCs(cursym.Func().Text, 0, ctxt.CompressInstructions)
if maxPC+maxTrampSize > (1 << 20) {
big = true
}
@@ -801,7 +801,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
// Validate all instructions - this provides nice error messages.
for p := cursym.Func().Text; p != nil; p = p.Link {
- for _, ins := range instructionsForProg(p) {
+ for _, ins := range instructionsForProg(p, ctxt.CompressInstructions) {
ins.validate(ctxt)
}
}
@@ -1141,6 +1141,14 @@ func wantImmU(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) {
}
}
+// isScaledImmI reports whether imm passes the signed immFits check for
+// nbits bits and is also an exact multiple of scale.
+func isScaledImmI(imm int64, nbits uint, scale int64) bool {
+	if err := immFits(imm, nbits, true); err != nil {
+		return false
+	}
+	return imm%scale == 0
+}
+
+// isScaledImmU reports whether imm passes the unsigned immFits check for
+// nbits bits and is also an exact multiple of scale.
+func isScaledImmU(imm int64, nbits uint, scale int64) bool {
+	if err := immFits(imm, nbits, false); err != nil {
+		return false
+	}
+	return imm%scale == 0
+}
+
func wantScaledImm(ctxt *obj.Link, ins *instruction, imm int64, nbits uint, scale int64, signed bool) {
if err := immFits(imm, nbits, signed); err != nil {
ctxt.Diag("%v: %v", ins, err)
@@ -1180,6 +1188,10 @@ func wantIntReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
wantReg(ctxt, ins, pos, "integer", r, REG_X0, REG_X31)
}
+// isIntPrimeReg reports whether r is one of the integer registers X8
+// through X15 (presumably the registers a prime register field of a
+// compressed instruction can name - confirm against wantIntPrimeReg).
+func isIntPrimeReg(r uint32) bool {
+	return REG_X8 <= r && r <= REG_X15
+}
+
// wantIntPrimeReg checks that r is an integer register that can be used
// in a prime register field of a compressed instruction.
func wantIntPrimeReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
@@ -1191,6 +1203,10 @@ func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31)
}
+// isFloatPrimeReg reports whether r is one of the floating-point registers
+// F8 through F15 (presumably the registers a prime register field of a
+// compressed instruction can name - confirm against wantFloatPrimeReg).
+func isFloatPrimeReg(r uint32) bool {
+	return REG_F8 <= r && r <= REG_F15
+}
+
// wantFloatPrimeReg checks that r is a floating-point register that can
// be used in a prime register field of a compressed instruction.
func wantFloatPrimeReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
@@ -3515,6 +3531,147 @@ func (ins *instruction) usesRegTmp() bool {
return ins.rd == REG_TMP || ins.rs1 == REG_TMP || ins.rs2 == REG_TMP
}
+// compress rewrites ins in place to use a compressed alternative when its
+// opcode, registers and immediate satisfy the constraints checked for that
+// alternative. An instruction that matches no case is left unchanged.
+// No control transfer instructions are handled here.
+func (ins *instruction) compress() {
+	switch ins.as {
+	case ALW:
+		if ins.rd != REG_X0 && ins.rs1 == REG_SP && isScaledImmU(ins.imm, 8, 4) {
+			ins.as, ins.rs1, ins.rs2 = ACLWSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 7, 4) {
+			ins.as = ACLW
+		}
+
+	case ALD:
+		if ins.rd != REG_X0 && ins.rs1 == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACLDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as = ACLD
+		}
+
+	case AFLD:
+		if ins.rs1 == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACFLDSP, obj.REG_NONE, ins.rs1
+		} else if isFloatPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as = ACFLD
+		}
+
+	// NOTE(review): the store cases compare rd against REG_SP, so rd
+	// presumably holds the base register and rs1 the value being stored -
+	// confirm against the store lowering.
+	case ASW:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 8, 4) {
+			ins.as, ins.rs1, ins.rs2 = ACSWSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 7, 4) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACSW, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case ASD:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACSDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACSD, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case AFSD:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACFSDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isFloatPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACFSD, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case AADDI:
+		// The order of these checks matters: the first matching form wins.
+		if ins.rd == REG_SP && ins.rs1 == REG_SP && ins.imm != 0 && isScaledImmI(ins.imm, 10, 16) {
+			ins.as = ACADDI16SP
+		} else if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.imm != 0 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACADDI
+		} else if isIntPrimeReg(ins.rd) && ins.rs1 == REG_SP && ins.imm != 0 && isScaledImmU(ins.imm, 10, 4) {
+			ins.as = ACADDI4SPN
+		} else if ins.rd != REG_X0 && ins.rs1 == REG_X0 && immIFits(ins.imm, 6) == nil {
+			ins.as, ins.rs1 = ACLI, obj.REG_NONE
+		} else if ins.rd != REG_X0 && ins.rs1 != REG_X0 && ins.imm == 0 {
+			ins.as, ins.rs1, ins.rs2 = ACMV, obj.REG_NONE, ins.rs1
+		} else if ins.rd == REG_X0 && ins.rs1 == REG_X0 && ins.imm == 0 {
+			ins.as, ins.rs1 = ACNOP, ins.rd
+		}
+
+	case AADDIW:
+		// C.ADDIW with rd = x0 is a reserved encoding, so X0 must be
+		// excluded here just as it is for the other non-prime rd forms.
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACADDIW
+		}
+
+	case ALUI:
+		if ins.rd != REG_X0 && ins.rd != REG_SP && ins.imm != 0 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACLUI
+		}
+
+	case ASLLI:
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSLLI
+		}
+
+	case ASRLI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSRLI
+		}
+
+	case ASRAI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSRAI
+		}
+
+	case AANDI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACANDI
+		}
+
+	case AADD:
+		// ADD is commutative, so the operands are swapped when rd matches
+		// rs2 rather than rs1.
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.rs2 != REG_X0 {
+			ins.as = ACADD
+		} else if ins.rd != REG_X0 && ins.rd == ins.rs2 && ins.rs1 != REG_X0 {
+			ins.as, ins.rs1, ins.rs2 = ACADD, ins.rs2, ins.rs1
+		} else if ins.rd != REG_X0 && ins.rs1 == REG_X0 && ins.rs2 != REG_X0 {
+			ins.as = ACMV
+		}
+
+	case AADDW:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACADDW
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACADDW, ins.rs2, ins.rs1
+		}
+
+	case ASUB:
+		// Subtraction is not commutative, so only the rd == rs1 form applies.
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACSUB
+		}
+
+	case ASUBW:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACSUBW
+		}
+
+	case AAND:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACAND
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACAND, ins.rs2, ins.rs1
+		}
+
+	case AOR:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACOR
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACOR, ins.rs2, ins.rs1
+		}
+
+	case AXOR:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACXOR
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACXOR, ins.rs2, ins.rs1
+		}
+
+	case AEBREAK:
+		// EBREAK is rewritten unconditionally.
+		ins.as, ins.rd, ins.rs1 = ACEBREAK, obj.REG_NONE, obj.REG_NONE
+	}
+}
+
// instructionForProg returns the default *obj.Prog to instruction mapping.
func instructionForProg(p *obj.Prog) *instruction {
ins := &instruction{
@@ -4057,7 +4214,7 @@ func instructionsForMinMax(p *obj.Prog, ins *instruction) []*instruction {
}
// instructionsForProg returns the machine instructions for an *obj.Prog.
-func instructionsForProg(p *obj.Prog) []*instruction {
+func instructionsForProg(p *obj.Prog, compress bool) []*instruction {
ins := instructionForProg(p)
inss := []*instruction{ins}
@@ -4710,6 +4867,15 @@ func instructionsForProg(p *obj.Prog) []*instruction {
ins.rs1, ins.rs2 = obj.REG_NONE, REG_V0
}
+ // Only compress instructions when there is no relocation, since
+ // relocation relies on knowledge about the exact instructions that
+ // are in use.
+ if compress && p.Mark&NEED_RELOC == 0 {
+ for _, ins := range inss {
+ ins.compress()
+ }
+ }
+
for _, ins := range inss {
ins.p = p
}
@@ -4814,7 +4980,7 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
}
offset := p.Pc
- for _, ins := range instructionsForProg(p) {
+ for _, ins := range instructionsForProg(p, ctxt.CompressInstructions) {
if ic, err := ins.encode(); err == nil {
cursym.WriteInt(ctxt, offset, ins.length(), int64(ic))
offset += int64(ins.length())