9 files changed, 552 insertions, 51 deletions
diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go
index 85dca33d27..c70c1d9438 100644
--- a/src/cmd/internal/obj/link.go
+++ b/src/cmd/internal/obj/link.go
@@ -1153,36 +1153,37 @@ type Func interface {
 // Link holds the context for writing object code from a compiler
 // to be linker input or for reading that input into the linker.
 type Link struct {
-	Headtype           objabi.HeadType
-	Arch               *LinkArch
-	Debugasm           int
-	Debugvlog          bool
-	Debugpcln          string
-	Flag_shared        bool
-	Flag_dynlink       bool
-	Flag_linkshared    bool
-	Flag_optimize      bool
-	Flag_locationlists bool
-	Flag_noRefName     bool   // do not include referenced symbol names in object file
-	Retpoline          bool   // emit use of retpoline stubs for indirect jmp/call
-	Flag_maymorestack  string // If not "", call this function before stack checks
-	Bso                *bufio.Writer
-	Pathname           string
-	Pkgpath            string           // the current package's import path
-	hashmu             sync.Mutex       // protects hash, funchash
-	hash               map[string]*LSym // name -> sym mapping
-	funchash           map[string]*LSym // name -> sym mapping for ABIInternal syms
-	statichash         map[string]*LSym // name -> sym mapping for static syms
-	PosTable           src.PosTable
-	InlTree            InlTree // global inlining tree used by gc/inl.go
-	DwFixups           *DwarfFixupTable
-	DwTextCount        int
-	Imports            []goobj.ImportedPkg
-	DiagFunc           func(string, ...any)
-	DiagFlush          func()
-	DebugInfo          func(ctxt *Link, fn *LSym, info *LSym, curfn Func) ([]dwarf.Scope, dwarf.InlCalls)
-	GenAbstractFunc    func(fn *LSym)
-	Errors             int
+	Headtype             objabi.HeadType
+	Arch                 *LinkArch
+	CompressInstructions bool // use compressed instructions where possible (if supported by architecture)
+	Debugasm             int
+	Debugvlog            bool
+	Debugpcln            string
+	Flag_shared          bool
+	Flag_dynlink         bool
+	Flag_linkshared      bool
+	Flag_optimize        bool
+	Flag_locationlists   bool
+	Flag_noRefName       bool   // do not include referenced symbol names in object file
+	Retpoline            bool   // emit use of retpoline stubs for indirect jmp/call
+	Flag_maymorestack    string // If not "", call this function before stack checks
+	Bso                  *bufio.Writer
+	Pathname             string
+	Pkgpath              string           // the current package's import path
+	hashmu               sync.Mutex       // protects hash, funchash
+	hash                 map[string]*LSym // name -> sym mapping
+	funchash             map[string]*LSym // name -> sym mapping for ABIInternal syms
+	statichash           map[string]*LSym // name -> sym mapping for static syms
+	PosTable             src.PosTable
+	InlTree              InlTree // global inlining tree used by gc/inl.go
+	DwFixups             *DwarfFixupTable
+	DwTextCount          int
+	Imports              []goobj.ImportedPkg
+	DiagFunc             func(string, ...any)
+	DiagFlush            func()
+	DebugInfo            func(ctxt *Link, fn *LSym, info *LSym, curfn Func) ([]dwarf.Scope, dwarf.InlCalls)
+	GenAbstractFunc      func(fn *LSym)
+	Errors               int
 
 	InParallel    bool // parallel backend phase in effect
 	UseBASEntries bool // use Base Address Selection Entries in location lists and PC ranges
diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go
index 73f145df14..5b8bffc9f1 100644
--- a/src/cmd/internal/obj/loong64/a.out.go
+++ b/src/cmd/internal/obj/loong64/a.out.go
@@ -589,6 +589,10 @@ const (
 	AORN
 	AANDN
 
+	// 2.2.1.12
+	AMULWVW
+	AMULWVWU
+
 	// 2.2.7. Atomic Memory Access Instructions
 	AAMSWAPB
 	AAMSWAPH
diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go
index ab85c52a21..1749b43bf6 100644
--- a/src/cmd/internal/obj/loong64/anames.go
+++ b/src/cmd/internal/obj/loong64/anames.go
@@ -131,6 +131,8 @@ var Anames = []string{
 	"ALSLV",
 	"ORN",
 	"ANDN",
+	"MULWVW",
+	"MULWVWU",
 	"AMSWAPB",
 	"AMSWAPH",
 	"AMSWAPW",
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 38b075d77e..b35e49a1b6 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -1503,6 +1503,8 @@ func buildop(ctxt *obj.Link) {
 			opset(AREMU, r0)
 			opset(ADIV, r0)
 			opset(ADIVU, r0)
+			opset(AMULWVW, r0)
+			opset(AMULWVWU, r0)
 
 		case AMULV:
 			opset(AMULVU, r0)
@@ -3230,6 +3232,10 @@ func (c *ctxt0) oprrr(a obj.As) uint32 {
 		return 0x3c << 15 // mulh.d
 	case AMULHVU:
 		return 0x3d << 15 // mulhu.d
+	case AMULWVW:
+		return 0x3e << 15 // mulw.d.w
+	case AMULWVWU:
+		return 0x3f << 15 // mulw.d.wu
 	case ADIV:
 		return 0x40 << 15 // div.w
 	case ADIVU:
diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go
index f40e57fa64..5b50d1533a 100644
--- a/src/cmd/internal/obj/riscv/asm_test.go
+++ b/src/cmd/internal/obj/riscv/asm_test.go
@@ -11,8 +11,8 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
-	"strings"
 	"testing"
 )
 
@@ -48,10 +48,10 @@ func genLargeBranch(buf *bytes.Buffer) {
 	fmt.Fprintln(buf, "TEXT f(SB),0,$0-0")
 	fmt.Fprintln(buf, "BEQ X0, X0, label")
 	for i := 0; i < 1<<19; i++ {
-		fmt.Fprintln(buf, "ADD $0, X0, X0")
+		fmt.Fprintln(buf, "ADD $0, X5, X0")
 	}
 	fmt.Fprintln(buf, "label:")
-	fmt.Fprintln(buf, "ADD $0, X0, X0")
+	fmt.Fprintln(buf, "ADD $0, X5, X0")
 }
 
 // TestLargeCall generates a large function (>1MB of text) with a call to
@@ -112,11 +112,11 @@ func genLargeCall(buf *bytes.Buffer) {
 	fmt.Fprintln(buf, "TEXT ·x(SB),0,$0-0")
 	fmt.Fprintln(buf, "CALL ·y(SB)")
 	for i := 0; i < 1<<19; i++ {
-		fmt.Fprintln(buf, "ADD $0, X0, X0")
+		fmt.Fprintln(buf, "ADD $0, X5, X0")
 	}
 	fmt.Fprintln(buf, "RET")
 	fmt.Fprintln(buf, "TEXT ·y(SB),0,$0-0")
-	fmt.Fprintln(buf, "ADD $0, X0, X0")
+	fmt.Fprintln(buf, "ADD $0, X5, X0")
 	fmt.Fprintln(buf, "RET")
 }
 
@@ -301,9 +301,9 @@ TEXT _stub(SB),$0-0
 	//	FENCE
 	//	NOP
 	//	FENCE
-	//	RET
-	want := "0f 00 f0 0f 13 00 00 00 0f 00 f0 0f 67 80 00 00"
-	if !strings.Contains(string(out), want) {
+	//	RET	(CJALR or JALR)
+	want := regexp.MustCompile("0x0000 0f 00 f0 0f 13 00 00 00 0f 00 f0 0f (82 80|67 80 00 00) ")
+	if !want.Match(out) {
 		t.Errorf("PCALIGN test failed - got %s\nwant %s", out, want)
 	}
 }
diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
index 60174a0b3a..a91395dd38 100644
--- a/src/cmd/internal/obj/riscv/cpu.go
+++ b/src/cmd/internal/obj/riscv/cpu.go
@@ -326,6 +326,14 @@ const (
 	NEED_GOT_PCREL_ITYPE_RELOC
 )
 
+// NEED_RELOC is the union of the NEED_*_RELOC mark bits listed below; a
+// non-zero p.Mark&NEED_RELOC means that at least one of them is set on p.
+const NEED_RELOC = NEED_JAL_RELOC |
+	NEED_CALL_RELOC |
+	NEED_PCREL_ITYPE_RELOC |
+	NEED_PCREL_STYPE_RELOC |
+	NEED_GOT_PCREL_ITYPE_RELOC
+
 // RISC-V mnemonics, as defined in the "opcodes" and "opcodes-pseudo" files
 // at https://github.com/riscv/riscv-opcodes.
 //
diff --git a/src/cmd/internal/obj/riscv/doc.go b/src/cmd/internal/obj/riscv/doc.go
new file mode 100644
index 0000000000..365bedd299
--- /dev/null
+++ b/src/cmd/internal/obj/riscv/doc.go
@@ -0,0 +1,297 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package riscv implements the riscv64 assembler.
+
+# Register naming
+
+The integer registers are named X0 through to X31, however X4 must be accessed
+through its RISC-V ABI name, TP, and X27, which holds a pointer to the
+goroutine structure, must be referred to as g. Additionally, when building in
+shared mode, X3 is unavailable and must be accessed via its RISC-V ABI name,
+GP.
+
+The floating-point registers are named F0 through to F31.
+
+The vector registers are named V0 through to V31.
+
+Both integer and floating-point registers can be referred to by their RISC-V
+ABI names, e.g., A0 or FT0, with the exception that X27 cannot be referred to
+by its RISC-V ABI name, S11.  It must be referred to as g.
+
+Some of the integer registers are used by the Go runtime and assembler - X26 is
+the closure pointer, X27 points to the goroutine structure and X31 is a
+temporary register used by the Go assembler. Use of X31 should be avoided in
+hand written assembly code as its value could be altered by the instruction
+sequences emitted by the assembler.
+
+# Instruction naming
+
+Many RISC-V instructions contain one or more suffixes in their names. In the
+[RISC-V ISA Manual] these suffixes are separated from each other and from
+the instruction mnemonic with a dot ('.'). In the Go assembler, the
+separators are omitted and the suffixes are written in upper case.
+
+Example:
+
+	FMVWX           <=>     fmv.w.x
+
+# Rounding modes
+
+The Go toolchain does not set the FCSR register and requires the desired
+rounding mode to be explicitly encoded within floating-point instructions.
+The syntax the Go assembler uses to specify the rounding modes differs
+from the syntax in the RISC-V specifications. In the [RISC-V ISA Manual]
+the rounding mode is given as an extra operand at the end of an
+assembly language instruction. In the Go assembler, the rounding modes are
+converted to uppercase and follow the instruction mnemonic from which they
+are separated with a dot ('.').
+
+Example:
+
+	FCVTLUS.RNE F0, X5      <=>     fcvt.lu.s x5, f0, rne
+
+RTZ is assumed if the rounding mode is omitted.
+
+# RISC-V extensions
+
+By default the Go compiler targets the [rva20u64] profile. This profile mandates
+all the general RISC-V instructions, allowing Go to use integer, multiplication,
+division, floating-point and atomic instructions without having to
+perform compile time or runtime checks to verify that their use is appropriate
+for the target hardware. All widely available riscv64 devices support at least
+[rva20u64]. The Go toolchain can be instructed to target later RISC-V profiles,
+including [rva22u64] and [rva23u64], via the GORISCV64 environment variable.
+Instructions that are provided by newer profiles cannot typically be used in
+handwritten assembly code without compile time guards (or runtime checks)
+that ensure they are hardware supported.
+
+The file asm_riscv64.h defines macros for each RISC-V extension that is enabled
+by setting the GORISCV64 environment variable to a value other than [rva20u64].
+For example, if GORISCV64=rva22u64 the macros hasZba, hasZbb and hasZbs will be
+defined. If GORISCV64=rva23u64 hasV will be defined in addition to hasZba,
+hasZbb and hasZbs. These macros can be used to determine whether it's safe
+to use an instruction in hand-written assembly.
+
+It is not always necessary to include asm_riscv64.h and use #ifdefs in your
+code to safely take advantage of instructions present in the [rva22u64]
+profile. In some cases the assembler can generate [rva20u64] compatible code
+even when an [rva22u64] instruction is used in an assembly source file. When
+GORISCV64=rva20u64 the assembler will synthesize certain [rva22u64]
+instructions, e.g., ANDN, using multiple [rva20u64] instructions. Instructions
+such as ANDN can then be freely used in assembly code without checking to see
+whether the instruction is supported by the target profile. When building a
+source file containing the ANDN instruction with GORISCV64=rva22u64 the
+assembler will emit the Zbb ANDN instruction directly. When building the same
+source file with GORISCV64=rva20u64 the assembler will emit multiple [rva20u64]
+instructions to synthesize ANDN.
+
+The assembler will also use [rva22u64] instructions to implement the zero and
+sign extension instructions, e.g., MOVB and MOVHU, when GORISCV64=rva22u64 or
+greater.
+
+The instructions not implemented in the default profile ([rva20u64]) that can
+be safely used in assembly code without compile time checks are:
+
+  - ANDN
+  - MAX
+  - MAXU
+  - MIN
+  - MINU
+  - MOVB
+  - MOVH
+  - MOVHU
+  - MOVWU
+  - ORN
+  - ROL
+  - ROLW
+  - ROR
+  - RORI
+  - RORIW
+  - RORW
+  - XNOR
+
+# Operand ordering
+
+The ordering used for instruction operands in the Go assembler differs from the
+ordering defined in the [RISC-V ISA Manual].
+
+1. R-Type instructions
+
+R-Type instructions are written in the reverse order to that given in the
+[RISC-V ISA Manual], with the register order being rs2, rs1, rd.
+
+Examples:
+
+	ADD X10, X11, X12       <=>     add x12, x11, x10
+	FADDD F10, F11, F12     <=>     fadd.d f12, f11, f10
+
+2. I-Type arithmetic instructions
+
+I-Type arithmetic instructions (not loads, fences, ebreak, ecall) use the same
+ordering as the R-Type instructions, typically, imm12, rs1, rd.
+
+Examples:
+
+	ADDI $1, X11, X12       <=>     addi x12, x11, 1
+	SLTI $1, X11, X12       <=>     slti x12, x11, 1
+
+3. Loads and Stores
+
+Load and store instructions are written with the source operand (whether it be
+a register or a memory address) first, followed by the destination operand.
+
+Examples:
+
+	MOV 16(X2), X10         <=>     ld x10, 16(x2)
+	MOV X10, (X2)           <=>     sd x10, 0(x2)
+
+4. Branch instructions
+
+The branch instructions use the same operand ordering as is given in the
+[RISC-V ISA Manual], e.g., rs1, rs2, label.
+
+Example:
+
+	BLT X12, X23, loop1     <=>     blt x12, x23, loop1
+
+BLT X12, X23, label will jump to label if X12 < X23. Note this is not the
+same ordering as is used for the SLT instructions.
+
+5. FMA instructions
+
+The Go assembler uses a different ordering for the RISC-V FMA operands to
+the ordering given in the [RISC-V ISA Manual]. The operands are rotated one
+place to the left, so that the destination operand comes last.
+
+Example:
+
+	FMADDS  F1, F2, F3, F4  <=>     fmadd.s f4, f1, f2, f3
+
+6. AMO instructions
+
+The ordering used for the AMO operations is rs2, rs1, rd, i.e., the operands
+as specified in the [RISC-V ISA Manual] are rotated one place to the left.
+
+Example:
+
+	AMOSWAPW X5, (X6), X7   <=>     amoswap.w x7, x5, (x6)
+
+7. Vector instructions
+
+The VSETVLI instruction uses the same symbolic names as the [RISC-V ISA Manual]
+to represent the components of vtype, with the exception
+that they are written in upper case. The ordering of the operands in the Go
+assembler differs from the [RISC-V ISA Manual] in that the operands are
+rotated one place to the left so that the destination register, the register
+that holds the new vl, is the last operand.
+
+Example:
+
+	VSETVLI X10, E8, M1, TU, MU, X12        <=>     vsetvli x12, x10, e8, m1, tu, mu
+
+Vector load and store instructions follow the pattern set by scalar loads and
+stores, i.e., the source is always the first operand and the destination the
+last. However, the ordering of the operands of these instructions is
+complicated by the optional mask register and, in some cases, the use of an
+additional stride or index register. In the Go assembler the index and stride
+registers appear as the second operand in indexed or strided loads and stores,
+while the mask register, if present, is always the penultimate operand.
+
+Examples:
+
+	VLE8V (X10), V3                 <=>     vle8.v  v3, (x10)
+	VSE8V V3, (X10)                 <=>     vse8.v  v3, (x10)
+	VLE8V (X10), V0, V3             <=>     vle8.v  v3, (x10), v0.t
+	VSE8V V3, V0, (X10)             <=>     vse8.v  v3, (x10), v0.t
+	VLSE8V (X10), X11, V3           <=>     vlse8.v v3, (x10), x11
+	VSSE8V V3, X11, (X10)           <=>     vsse8.v v3, (x10), x11
+	VLSE8V (X10), X11, V0, V3       <=>     vlse8.v v3, (x10), x11, v0.t
+	VSSE8V V3, X11, V0, (X10)       <=>     vsse8.v v3, (x10), x11, v0.t
+	VLUXEI8V (X10), V2, V3          <=>     vluxei8.v v3, (x10), v2
+	VSUXEI8V V3, V2, (X10)          <=>     vsuxei8.v v3, (x10), v2
+	VLUXEI8V (X10), V2, V0, V3      <=>     vluxei8.v v3, (x10), v2, v0.t
+	VSUXEI8V V3, V2, V0, (X10)      <=>     vsuxei8.v v3, (x10), v2, v0.t
+	VL1RE8V (X10), V3               <=>     vl1re8.v v3, (x10)
+	VS1RV V3, (X11)                 <=>     vs1r.v  v3, (x11)
+
+The ordering of operands for two and three argument vector arithmetic instructions is
+reversed in the Go assembler.
+
+Examples:
+
+	VMVVV V2, V3                    <=> vmv.v.v v3, v2
+	VADDVV V1, V2, V3               <=> vadd.vv v3, v2, v1
+	VADDVX X10, V2, V3              <=> vadd.vx v3, v2, x10
+	VMADCVI $15, V2, V3             <=> vmadc.vi v3, v2, 15
+
+The mask register, when specified, is always the penultimate operand in a vector
+arithmetic instruction, appearing before the destination register.
+
+Example:
+
+	VANDVV V1, V2, V0, V3           <=> vand.vv v3, v2, v1, v0.t
+
+# Ternary instructions
+
+The Go assembler allows the second operand to be omitted from most ternary
+instructions if it matches the third (destination) operand.
+
+Examples:
+
+	ADD X10, X12, X12       <=>     ADD X10, X12
+	ANDI $3, X12, X12       <=>     ANDI $3, X12
+
+The use of this abbreviated syntax is encouraged.
+
+# Ordering of atomic instructions
+
+It is not possible to specify the ordering bits in the FENCE, LR, SC or AMO
+instructions.  The FENCE instruction is always emitted as a full fence, the
+acquire and release bits are always set for the AMO instructions, the acquire
+bit is always set for the LR instructions while the release bit is set for
+the SC instructions.
+
+# Immediate operands
+
+In many cases, where an R-Type instruction has a corresponding I-Type
+instruction, the R-Type mnemonic can be used in place of the I-Type mnemonic.
+The assembler assumes that the immediate form of the instruction was intended
+when the first operand is given as an immediate value rather than a register.
+
+Example:
+
+	AND $3, X12, X13        <=>     ANDI $3, X12, X13
+
+# Integer constant materialization
+
+The MOV instruction can be used to set a register to the value of any 64 bit
+constant literal. The way this is achieved by the assembler varies depending
+on the value of the constant. Where possible the assembler will synthesize the
+constant using one or more RISC-V arithmetic instructions. If it is unable
+to easily materialize the constant it will load the 64 bit literal from memory.
+
+A 32 bit constant literal can be specified as an argument to ADDI, ANDI, ORI and
+XORI. If the specified literal does not fit into 12 bits the assembler will
+generate extra instructions to synthesize it.
+
+Integer constants provided as operands to all other instructions must fit into
+the number of bits allowed by the instructions' encodings for immediate values.
+Otherwise, an error will be generated.
+
+# Floating point constant materialization
+
+The MOVF and MOVD instructions can be used to set a register to the value
+of any 32 bit or 64 bit floating point constant literal, respectively.  Unless
+the constant literal is 0.0, MOVF and MOVD will be encoded as FLW and FLD
+instructions that load the constant from a location within the program's
+binary.
+
+[RISC-V ISA Manual]: https://github.com/riscv/riscv-isa-manual
+[rva20u64]: https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc#51-rva20u64-profile
+[rva22u64]: https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc#rva22u64-profile
+[rva23u64]: https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc#rva23u64-profile
+*/
+package riscv
diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
index 3deab34d31..043be17c07 100644
--- a/src/cmd/internal/obj/riscv/obj.go
+++ b/src/cmd/internal/obj/riscv/obj.go
@@ -414,10 +414,10 @@ func containsCall(sym *obj.LSym) bool {
 
 // setPCs sets the Pc field in all instructions reachable from p.
 // It uses pc as the initial value and returns the next available pc.
-func setPCs(p *obj.Prog, pc int64) int64 {
+func setPCs(p *obj.Prog, pc int64, compress bool) int64 {
 	for ; p != nil; p = p.Link {
 		p.Pc = pc
-		for _, ins := range instructionsForProg(p) {
+		for _, ins := range instructionsForProg(p, compress) {
 			pc += int64(ins.length())
 		}
 
@@ -671,7 +671,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 	// a fixed point will be reached).  No attempt to handle functions > 2GiB.
 	for {
 		big, rescan := false, false
-		maxPC := setPCs(cursym.Func().Text, 0)
+		maxPC := setPCs(cursym.Func().Text, 0, ctxt.CompressInstructions)
 		if maxPC+maxTrampSize > (1 << 20) {
 			big = true
 		}
@@ -801,7 +801,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 
 	// Validate all instructions - this provides nice error messages.
 	for p := cursym.Func().Text; p != nil; p = p.Link {
-		for _, ins := range instructionsForProg(p) {
+		for _, ins := range instructionsForProg(p, ctxt.CompressInstructions) {
 			ins.validate(ctxt)
 		}
 	}
@@ -1141,6 +1141,24 @@ func wantImmU(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) {
 	}
 }
 
+// isScaledImmI reports whether imm is accepted by immFits as a signed
+// nbits-bit immediate and is an exact multiple of scale.
+func isScaledImmI(imm int64, nbits uint, scale int64) bool {
+	if err := immFits(imm, nbits, true); err != nil {
+		return false
+	}
+	return imm%scale == 0
+}
+
+// isScaledImmU reports whether imm is accepted by immFits as an unsigned
+// nbits-bit immediate and is an exact multiple of scale.
+func isScaledImmU(imm int64, nbits uint, scale int64) bool {
+	if err := immFits(imm, nbits, false); err != nil {
+		return false
+	}
+	return imm%scale == 0
+}
+
 func wantScaledImm(ctxt *obj.Link, ins *instruction, imm int64, nbits uint, scale int64, signed bool) {
 	if err := immFits(imm, nbits, signed); err != nil {
 		ctxt.Diag("%v: %v", ins, err)
@@ -1180,6 +1188,12 @@ func wantIntReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
 	wantReg(ctxt, ins, pos, "integer", r, REG_X0, REG_X31)
 }
 
+// isIntPrimeReg reports whether r lies in the range X8 through X15, the
+// integer registers selectable by a compressed prime register field.
+func isIntPrimeReg(r uint32) bool {
+	return REG_X8 <= r && r <= REG_X15
+}
+
 // wantIntPrimeReg checks that r is an integer register that can be used
 // in a prime register field of a compressed instruction.
 func wantIntPrimeReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
@@ -1191,6 +1203,12 @@ func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
 	wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31)
 }
 
+// isFloatPrimeReg reports whether r lies in the range F8 through F15, the
+// floating-point registers selectable by a compressed prime register field.
+func isFloatPrimeReg(r uint32) bool {
+	return REG_F8 <= r && r <= REG_F15
+}
+
 // wantFloatPrimeReg checks that r is an floating-point register that can
 // be used in a prime register field of a compressed instruction.
 func wantFloatPrimeReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
@@ -3515,6 +3531,157 @@ func (ins *instruction) usesRegTmp() bool {
 	return ins.rd == REG_TMP || ins.rs1 == REG_TMP || ins.rs2 == REG_TMP
 }
 
+// compress rewrites ins in place into an equivalent compressed instruction
+// when its registers and immediate satisfy the constraints checked below
+// for one of the compressed forms. An instruction that matches none of
+// the cases is left unchanged.
+func (ins *instruction) compress() {
+	switch ins.as {
+	case ALW:
+		if ins.rd != REG_X0 && ins.rs1 == REG_SP && isScaledImmU(ins.imm, 8, 4) {
+			ins.as, ins.rs1, ins.rs2 = ACLWSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 7, 4) {
+			ins.as = ACLW
+		}
+
+	case ALD:
+		if ins.rs1 == REG_SP && ins.rd != REG_X0 && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACLDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as = ACLD
+		}
+
+	case AFLD:
+		if ins.rs1 == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACFLDSP, obj.REG_NONE, ins.rs1
+		} else if isFloatPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as = ACFLD
+		}
+
+	// In the store cases below rd is treated as the base address register
+	// and rs1 as the register holding the value to be stored.
+	case ASW:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 8, 4) {
+			ins.as, ins.rs1, ins.rs2 = ACSWSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 7, 4) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACSW, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case ASD:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACSDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACSD, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case AFSD:
+		if ins.rd == REG_SP && isScaledImmU(ins.imm, 9, 8) {
+			ins.as, ins.rs1, ins.rs2 = ACFSDSP, obj.REG_NONE, ins.rs1
+		} else if isIntPrimeReg(ins.rd) && isFloatPrimeReg(ins.rs1) && isScaledImmU(ins.imm, 8, 8) {
+			ins.as, ins.rd, ins.rs1, ins.rs2 = ACFSD, obj.REG_NONE, ins.rd, ins.rs1
+		}
+
+	case AADDI:
+		if ins.rd == REG_SP && ins.rs1 == REG_SP && ins.imm != 0 && isScaledImmI(ins.imm, 10, 16) {
+			ins.as = ACADDI16SP
+		} else if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.imm != 0 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACADDI
+		} else if isIntPrimeReg(ins.rd) && ins.rs1 == REG_SP && ins.imm != 0 && isScaledImmU(ins.imm, 10, 4) {
+			ins.as = ACADDI4SPN
+		} else if ins.rd != REG_X0 && ins.rs1 == REG_X0 && immIFits(ins.imm, 6) == nil {
+			ins.as, ins.rs1 = ACLI, obj.REG_NONE
+		} else if ins.rd != REG_X0 && ins.rs1 != REG_X0 && ins.imm == 0 {
+			ins.as, ins.rs1, ins.rs2 = ACMV, obj.REG_NONE, ins.rs1
+		} else if ins.rd == REG_X0 && ins.rs1 == REG_X0 && ins.imm == 0 {
+			ins.as, ins.rs1 = ACNOP, ins.rd
+		}
+
+	case AADDIW:
+		// C.ADDIW is only valid for a non-zero rd - the encoding with
+		// rd=X0 is reserved, so leave such instructions uncompressed.
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACADDIW
+		}
+
+	case ALUI:
+		if ins.rd != REG_X0 && ins.rd != REG_SP && ins.imm != 0 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACLUI
+		}
+
+	case ASLLI:
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSLLI
+		}
+
+	case ASRLI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSRLI
+		}
+
+	case ASRAI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && ins.imm != 0 {
+			ins.as = ACSRAI
+		}
+
+	case AANDI:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && immIFits(ins.imm, 6) == nil {
+			ins.as = ACANDI
+		}
+
+	case AADD:
+		// The second case swaps rs1 and rs2 so that rd == rs1 holds
+		// for the resulting CADD; addition is commutative.
+		if ins.rd != REG_X0 && ins.rd == ins.rs1 && ins.rs2 != REG_X0 {
+			ins.as = ACADD
+		} else if ins.rd != REG_X0 && ins.rd == ins.rs2 && ins.rs1 != REG_X0 {
+			ins.as, ins.rs1, ins.rs2 = ACADD, ins.rs2, ins.rs1
+		} else if ins.rd != REG_X0 && ins.rs1 == REG_X0 && ins.rs2 != REG_X0 {
+			ins.as = ACMV
+		}
+
+	case AADDW:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACADDW
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACADDW, ins.rs2, ins.rs1
+		}
+
+	case ASUB:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACSUB
+		}
+
+	case ASUBW:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACSUBW
+		}
+
+	case AAND:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACAND
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACAND, ins.rs2, ins.rs1
+		}
+
+	case AOR:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACOR
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACOR, ins.rs2, ins.rs1
+		}
+
+	case AXOR:
+		if isIntPrimeReg(ins.rd) && ins.rd == ins.rs1 && isIntPrimeReg(ins.rs2) {
+			ins.as = ACXOR
+		} else if isIntPrimeReg(ins.rd) && isIntPrimeReg(ins.rs1) && ins.rd == ins.rs2 {
+			ins.as, ins.rs1, ins.rs2 = ACXOR, ins.rs2, ins.rs1
+		}
+
+	case AEBREAK:
+		ins.as, ins.rd, ins.rs1 = ACEBREAK, obj.REG_NONE, obj.REG_NONE
+	}
+}
+
 // instructionForProg returns the default *obj.Prog to instruction mapping.
 func instructionForProg(p *obj.Prog) *instruction {
 	ins := &instruction{
@@ -4057,7 +4214,7 @@ func instructionsForMinMax(p *obj.Prog, ins *instruction) []*instruction {
 }
 
 // instructionsForProg returns the machine instructions for an *obj.Prog.
-func instructionsForProg(p *obj.Prog) []*instruction {
+func instructionsForProg(p *obj.Prog, compress bool) []*instruction {
 	ins := instructionForProg(p)
 	inss := []*instruction{ins}
 
@@ -4710,6 +4867,15 @@ func instructionsForProg(p *obj.Prog) []*instruction {
 		ins.rs1, ins.rs2 = obj.REG_NONE, REG_V0
 	}
 
+	// Only compress instructions when there is no relocation, since
+	// relocation relies on knowledge about the exact instructions that
+	// are in use.
+	if compress && p.Mark&NEED_RELOC == 0 {
+		for _, ins := range inss {
+			ins.compress()
+		}
+	}
+
 	for _, ins := range inss {
 		ins.p = p
 	}
@@ -4799,15 +4965,22 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 			v := pcAlignPadLength(p.Pc, alignedValue)
 			offset := p.Pc
 			for ; v >= 4; v -= 4 {
-				// NOP
-				cursym.WriteBytes(ctxt, offset, []byte{0x13, 0, 0, 0})
+				// NOP (ADDI $0, X0, X0)
+				cursym.WriteBytes(ctxt, offset, []byte{0x13, 0x00, 0x00, 0x00})
 				offset += 4
 			}
+			if v == 2 {
+				// CNOP
+				cursym.WriteBytes(ctxt, offset, []byte{0x01, 0x00})
+				offset += 2
+			} else if v != 0 {
+				ctxt.Diag("bad PCALIGN pad length")
+			}
 			continue
 		}
 
 		offset := p.Pc
-		for _, ins := range instructionsForProg(p) {
+		for _, ins := range instructionsForProg(p, ctxt.CompressInstructions) {
 			if ic, err := ins.encode(); err == nil {
 				cursym.WriteInt(ctxt, offset, ins.length(), int64(ic))
 				offset += int64(ins.length())
diff --git a/src/cmd/internal/obj/x86/obj6.go b/src/cmd/internal/obj/x86/obj6.go
index 9c8e5e96f8..ed41d81388 100644
--- a/src/cmd/internal/obj/x86/obj6.go
+++ b/src/cmd/internal/obj/x86/obj6.go
@@ -423,8 +423,12 @@ func rewriteToUseGot(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 			q.From.Reg = reg
 		}
 	}
-	if p.GetFrom3() != nil && p.GetFrom3().Name == obj.NAME_EXTERN {
-		ctxt.Diag("don't know how to handle %v with -dynlink", p)
+	from3 := p.GetFrom3()
+	for i := range p.RestArgs {
+		a := &p.RestArgs[i].Addr
+		if a != from3 && a.Name == obj.NAME_EXTERN && !a.Sym.Local() {
+			ctxt.Diag("don't know how to handle %v with -dynlink", p)
+		}
 	}
 	var source *obj.Addr
 	// MOVx sym, Ry becomes $MOV sym@GOT, R15; MOVx (R15), Ry
@@ -434,9 +438,17 @@ func rewriteToUseGot(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 		if p.To.Name == obj.NAME_EXTERN && !p.To.Sym.Local() {
 			ctxt.Diag("cannot handle NAME_EXTERN on both sides in %v with -dynlink", p)
 		}
+		if from3 != nil && from3.Name == obj.NAME_EXTERN && !from3.Sym.Local() {
+			ctxt.Diag("cannot handle NAME_EXTERN on multiple operands in %v with -dynlink", p)
+		}
 		source = &p.From
 	} else if p.To.Name == obj.NAME_EXTERN && !p.To.Sym.Local() {
+		if from3 != nil && from3.Name == obj.NAME_EXTERN && !from3.Sym.Local() {
+			ctxt.Diag("cannot handle NAME_EXTERN on multiple operands in %v with -dynlink", p)
+		}
 		source = &p.To
+	} else if from3 != nil && from3.Name == obj.NAME_EXTERN && !from3.Sym.Local() {
+		source = from3
 	} else {
 		return
 	}
@@ -501,9 +513,7 @@ func rewriteToUseGot(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 	p2.As = p.As
 	p2.From = p.From
 	p2.To = p.To
-	if from3 := p.GetFrom3(); from3 != nil {
-		p2.AddRestSource(*from3)
-	}
+	p2.RestArgs = p.RestArgs
 	if p.From.Name == obj.NAME_EXTERN {
 		p2.From.Reg = reg
 		p2.From.Name = obj.NAME_NONE
@@ -512,6 +522,11 @@ func rewriteToUseGot(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 		p2.To.Reg = reg
 		p2.To.Name = obj.NAME_NONE
 		p2.To.Sym = nil
+	} else if p.GetFrom3() != nil && p.GetFrom3().Name == obj.NAME_EXTERN {
+		from3 = p2.GetFrom3()
+		from3.Reg = reg
+		from3.Name = obj.NAME_NONE
+		from3.Sym = nil
 	} else {
 		return
 	}