17 files changed, 195 insertions, 381 deletions
diff --git a/src/math/big/arith.go b/src/math/big/arith.go
index b0885f261f..750ce8aa39 100644
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@@ -60,12 +60,6 @@ func nlz(x Word) uint {
 	return uint(bits.LeadingZeros(uint(x)))
 }
 
-// q = (u1<<_W + u0 - r)/v
-func divWW_g(u1, u0, v Word) (q, r Word) {
-	qq, rr := bits.Div(uint(u1), uint(u0), uint(v))
-	return Word(qq), Word(rr)
-}
-
 // The resulting carry c is either 0 or 1.
 func addVV_g(z, x, y []Word) (c Word) {
 	// The comment near the top of this file discusses this for loop condition.
@@ -207,10 +201,87 @@ func addMulVVW_g(z, x []Word, y Word) (c Word) {
 	return
 }
 
-func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) {
+// divWW returns quotient q and remainder r with q = (x1<<_W + x0 - r)/y. It requires x1 < y.
+// m is the approximate reciprocal floor((_B^2-1)/d - _B), where d = y<<nlz(y) is y normalized as in the body.
+// Reference: "Improved Division by Invariant Integers" (IEEE Transactions on Computers, 11 Jun. 2010).
+func divWW(x1, x0, y, m Word) (q, r Word) {
+	s := nlz(y)
+	if s != 0 { // normalize: shift y left by its leading-zero count, and x1:x0 by the same amount
+		x1 = x1<<s | x0>>(_W-s)
+		x0 <<= s
+		y <<= s
+	}
+	d := uint(y)
+	// We know that
+	//   m = ⎣(B^2-1)/d⎦-B
+	//   ⎣(B^2-1)/d⎦ = m+B
+	//   (B^2-1)/d = m+B+delta1    0 <= delta1 <= (d-1)/d
+	//   B^2/d = m+B+delta2        0 <= delta2 <= 1
+	// The quotient we're trying to compute is
+	//   quotient = ⎣(x1*B+x0)/d⎦
+	//            = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦
+	//            = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦
+	//            = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦
+	// The latter two terms of this three-term sum are between 0 and 1.
+	// So we can compute just the first term, and we will be low by at most 2.
+	t1, t0 := bits.Mul(uint(m), uint(x1))
+	_, c := bits.Add(t0, uint(x0), 0)
+	t1, _ = bits.Add(t1, uint(x1), c)
+	// The quotient is either t1, t1+1, or t1+2.
+	// We'll try t1 and adjust if needed.
+	qq := t1
+	// compute remainder r=x-d*q.
+	dq1, dq0 := bits.Mul(d, qq)
+	r0, b := bits.Sub(uint(x0), dq0, 0)
+	r1, _ := bits.Sub(uint(x1), dq1, b)
+	// The remainder we just computed is bounded above by B+d:
+	// r = x1*B + x0 - d*q.
+	//   = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦
+	//   = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha)                                   0 <= alpha < 1
+	//   = x1*B + x0 - x1*d/B*m                         - x1*d - x0*d/B + d*alpha
+	//   = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦             - x1*d - x0*d/B + d*alpha
+	//   (next, the floor is rewritten as its argument minus a fractional part beta)
+	//   = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta)        - x1*d - x0*d/B + d*alpha   0 <= beta < 1
+	//   = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha
+	//   =        x0        + x1/B        + x1*d/B*beta        - x0*d/B + d*alpha
+	//   = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha
+	//   <  B*(1-d/B) +  d*B/B          + d          because x0<B (and 1-d/B>0), x1<d, 1+d*beta<=B, alpha<1
+	//   =  B - d     +  d              + d
+	//   = B+d
+	// So r1 can only be 0 or 1. If r1 is 1, then we know q was too small.
+	// Add 1 to q and subtract d from r. That guarantees that r is <B, so
+	// we no longer need to keep track of r1.
+	if r1 != 0 {
+		qq++
+		r0 -= d
+	}
+	// If the remainder is still too large, increment q one more time.
+	if r0 >= d {
+		qq++
+		r0 -= d
+	}
+	return Word(qq), Word(r0 >> s) // shift the remainder back to undo the normalization
+}
+
+func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) {
 	r = xn
+	if len(x) == 1 { // single word: divide directly and skip the reciprocal computation
+		qq, rr := bits.Div(uint(r), uint(x[0]), uint(y))
+		z[0] = Word(qq)
+		return Word(rr)
+	}
+	rec := reciprocalWord(y) // computed once and reused for every word of x
 	for i := len(z) - 1; i >= 0; i-- {
-		z[i], r = divWW_g(r, x[i], y)
+		z[i], r = divWW(r, x[i], y, rec) // r carries over as the high word of the next, lower dividend
 	}
-	return
+	return r
+}
+
+// reciprocalWord returns the reciprocal of the divisor. rec = floor(( _B^2 - 1 ) / u - _B). u = d1 << nlz(d1).
+func reciprocalWord(d1 Word) Word {
+	u := uint(d1 << nlz(d1))
+	x1 := ^u
+	x0 := uint(_M)
+	rec, _ := bits.Div(x1, x0, u) // (_B^2-1)/u-_B = (_B*(_M-u)+_M)/u, where x1 = ^u = _M-u and x0 = _M
+	return Word(rec)
 }
diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s
index f61da2aba7..d0ea949fe6 100644
--- a/src/math/big/arith_386.s
+++ b/src/math/big/arith_386.s
@@ -18,16 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
 	RET
 
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB),NOSPLIT,$0
-	MOVL x1+0(FP), DX
-	MOVL x0+4(FP), AX
-	DIVL y+8(FP)
-	MOVL AX, q+12(FP)
-	MOVL DX, r+16(FP)
-	RET
-
-
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
 	MOVL z+0(FP), DI
@@ -251,21 +241,4 @@ E6:	CMPL BX, $0		// i < 0
 	RET
 
 
-// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
-TEXT ·divWVW(SB),NOSPLIT,$0
-	MOVL z+0(FP), DI
-	MOVL xn+12(FP), DX	// r = xn
-	MOVL x+16(FP), SI
-	MOVL y+28(FP), CX
-	MOVL z_len+4(FP), BX	// i = z
-	JMP E7
 
-L7:	MOVL (SI)(BX*4), AX
-	DIVL CX
-	MOVL AX, (DI)(BX*4)
-
-E7:	SUBL $1, BX		// i--
-	JGE L7			// i >= 0
-
-	MOVL DX, r+32(FP)
-	RET
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index b75639f540..61043ca2d9 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -18,14 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
 	RET
 
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB),NOSPLIT,$0
-	MOVQ x1+0(FP), DX
-	MOVQ x0+8(FP), AX
-	DIVQ y+16(FP)
-	MOVQ AX, q+24(FP)
-	MOVQ DX, r+32(FP)
-	RET
 
 // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
 // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
@@ -531,21 +523,3 @@ adx_short:
 
 
 
-// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
-TEXT ·divWVW(SB),NOSPLIT,$0
-	MOVQ z+0(FP), R10
-	MOVQ xn+24(FP), DX	// r = xn
-	MOVQ x+32(FP), R8
-	MOVQ y+56(FP), R9
-	MOVQ z_len+8(FP), BX	// i = z
-	JMP E7
-
-L7:	MOVQ (R8)(BX*8), AX
-	DIVQ R9
-	MOVQ AX, (R10)(BX*8)
-
-E7:	SUBQ $1, BX		// i--
-	JGE L7			// i >= 0
-
-	MOVQ DX, r+64(FP)
-	RET
diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s
index 33aa36f709..cbf7445e7a 100644
--- a/src/math/big/arith_arm.s
+++ b/src/math/big/arith_arm.s
@@ -272,17 +272,6 @@ E9:
 	RET
 
 
-// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
-TEXT ·divWVW(SB),NOSPLIT,$0
-	// ARM has no multiword division, so use portable code.
-	B ·divWVW_g(SB)
-
-
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB),NOSPLIT,$0
-	// ARM has no multiword division, so use portable code.
-	B ·divWW_g(SB)
-
 
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index da6e408e19..22357d088e 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -23,11 +23,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
 	RET
 
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB),NOSPLIT,$0
-	B	·divWW_g(SB) // ARM64 has no multiword division
-
-
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
 	MOVD	z_len+8(FP), R0
@@ -585,6 +580,4 @@ done:
 	MOVD	R4, c+56(FP)
 	RET
 
-// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
-TEXT ·divWVW(SB),NOSPLIT,$0
-	B ·divWVW_g(SB)
+
diff --git a/src/math/big/arith_decl.go b/src/math/big/arith_decl.go
index 41e592334c..d519bdc87b 100644
--- a/src/math/big/arith_decl.go
+++ b/src/math/big/arith_decl.go
@@ -8,7 +8,6 @@ package big
 
 // implemented in arith_$GOARCH.s
 func mulWW(x, y Word) (z1, z0 Word)
-func divWW(x1, x0, y Word) (q, r Word)
 func addVV(z, x, y []Word) (c Word)
 func subVV(z, x, y []Word) (c Word)
 func addVW(z, x []Word, y Word) (c Word)
@@ -17,4 +16,3 @@ func shlVU(z, x []Word, s uint) (c Word)
 func shrVU(z, x []Word, s uint) (c Word)
 func mulAddVWW(z, x []Word, y, r Word) (c Word)
 func addMulVVW(z, x []Word, y Word) (c Word)
-func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go
index 305f7ee03b..5faa3bd281 100644
--- a/src/math/big/arith_decl_pure.go
+++ b/src/math/big/arith_decl_pure.go
@@ -10,10 +10,6 @@ func mulWW(x, y Word) (z1, z0 Word) {
 	return mulWW_g(x, y)
 }
 
-func divWW(x1, x0, y Word) (q, r Word) {
-	return divWW_g(x1, x0, y)
-}
-
 func addVV(z, x, y []Word) (c Word) {
 	return addVV_g(z, x, y)
 }
@@ -55,7 +51,3 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) {
 func addMulVVW(z, x []Word, y Word) (c Word) {
 	return addMulVVW_g(z, x, y)
 }
-
-func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) {
-	return divWVW_g(z, xn, x, y)
-}
diff --git a/src/math/big/arith_mips64x.s b/src/math/big/arith_mips64x.s
index 983510ee3d..804b9fe06e 100644
--- a/src/math/big/arith_mips64x.s
+++ b/src/math/big/arith_mips64x.s
@@ -12,9 +12,6 @@
 TEXT ·mulWW(SB),NOSPLIT,$0
 	JMP ·mulWW_g(SB)
 
-TEXT ·divWW(SB),NOSPLIT,$0
-	JMP ·divWW_g(SB)
-
 TEXT ·addVV(SB),NOSPLIT,$0
 	JMP ·addVV_g(SB)
 
@@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
 TEXT ·addMulVVW(SB),NOSPLIT,$0
 	JMP ·addMulVVW_g(SB)
 
-TEXT ·divWVW(SB),NOSPLIT,$0
-	JMP ·divWVW_g(SB)
diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s
index 54cafbd9c0..efdecb80f3 100644
--- a/src/math/big/arith_mipsx.s
+++ b/src/math/big/arith_mipsx.s
@@ -12,9 +12,6 @@
 TEXT ·mulWW(SB),NOSPLIT,$0
 	JMP	·mulWW_g(SB)
 
-TEXT ·divWW(SB),NOSPLIT,$0
-	JMP	·divWW_g(SB)
-
 TEXT ·addVV(SB),NOSPLIT,$0
 	JMP	·addVV_g(SB)
 
@@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
 TEXT ·addMulVVW(SB),NOSPLIT,$0
 	JMP	·addMulVVW_g(SB)
 
-TEXT ·divWVW(SB),NOSPLIT,$0
-	JMP	·divWVW_g(SB)
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 409e10ab48..b299ccc2fb 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -478,44 +478,4 @@ done:
 	MOVD R4, c+56(FP)
 	RET
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB), NOSPLIT, $0
-	MOVD x1+0(FP), R4
-	MOVD x0+8(FP), R5
-	MOVD y+16(FP), R6
 
-	CMPU R4, R6
-	BGE  divbigger
-
-	// from the programmer's note in ch. 3 of the ISA manual, p.74
-	DIVDEU R6, R4, R3
-	DIVDU  R6, R5, R7
-	MULLD  R6, R3, R8
-	MULLD  R6, R7, R20
-	SUB    R20, R5, R10
-	ADD    R7, R3, R3
-	SUB    R8, R10, R4
-	CMPU   R4, R10
-	BLT    adjust
-	CMPU   R4, R6
-	BLT    end
-
-adjust:
-	MOVD $1, R21
-	ADD  R21, R3, R3
-	SUB  R6, R4, R4
-
-end:
-	MOVD R3, q+24(FP)
-	MOVD R4, r+32(FP)
-
-	RET
-
-divbigger:
-	MOVD $-1, R7
-	MOVD R7, q+24(FP)
-	MOVD R7, r+32(FP)
-	RET
-
-TEXT ·divWVW(SB), NOSPLIT, $0
-	BR ·divWVW_g(SB)
diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s
index 59065c3f7b..a2f7666c7b 100644
--- a/src/math/big/arith_riscv64.s
+++ b/src/math/big/arith_riscv64.s
@@ -19,9 +19,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0
 	MOV	X8, z0+24(FP)
 	RET
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB),NOSPLIT,$0
-	JMP ·divWW_g(SB)		// riscv64 has no multiword division
 
 TEXT ·addVV(SB),NOSPLIT,$0
 	JMP ·addVV_g(SB)
@@ -47,5 +44,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
 TEXT ·addMulVVW(SB),NOSPLIT,$0
 	JMP ·addMulVVW_g(SB)
 
-TEXT ·divWVW(SB),NOSPLIT,$0
-	JMP ·divWVW_g(SB)
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s
index 4891768111..caa4db0829 100644
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@@ -17,15 +17,6 @@ TEXT ·mulWW(SB), NOSPLIT, $0
 	MOVD   R11, z0+24(FP)
 	RET
 
-// func divWW(x1, x0, y Word) (q, r Word)
-TEXT ·divWW(SB), NOSPLIT, $0
-	MOVD x1+0(FP), R10
-	MOVD x0+8(FP), R11
-	MOVD y+16(FP), R5
-	WORD $0xb98700a5   // dlgr r10,r5
-	MOVD R11, q+24(FP)
-	MOVD R10, r+32(FP)
-	RET
 
 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
 // func addVV(z, x, y []Word) (c Word)
@@ -702,199 +693,11 @@ returnC:
 
 // func shlVU(z, x []Word, s uint) (c Word)
 TEXT ·shlVU(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R5
-	MOVD $0, R0
-	SUB  $1, R5          // n--
-	BLT  X8b             // n < 0        (n <= 0)
-
-	// n > 0
-	MOVD   s+48(FP), R4
-	CMPBEQ R0, R4, Z80     // handle 0 case beq
-	MOVD   $64, R6
-	CMPBEQ R6, R4, Z864    // handle 64 case beq
-	MOVD   z+0(FP), R2
-	MOVD   x+24(FP), R8
-	SLD    $3, R5          // n = n*8
-	SUB    R4, R6, R7
-	MOVD   (R8)(R5*1), R10 // w1 = x[i-1]
-	SRD    R7, R10, R3
-	MOVD   R3, c+56(FP)
-
-	MOVD $0, R1 // i = 0
-	BR   E8
-
-	// i < n-1
-L8:
-	MOVD R10, R3           // w = w1
-	MOVD -8(R8)(R5*1), R10 // w1 = x[i+1]
-
-	SLD  R4, R3         // w<<s | w1>>ŝ
-	SRD  R7, R10, R6
-	OR   R6, R3
-	MOVD R3, (R2)(R5*1) // z[i] = w<<s | w1>>ŝ
-	SUB  $8, R5         // i--
-
-E8:
-	CMPBGT R5, R0, L8 // i < n-1
-
-	// i >= n-1
-X8a:
-	SLD  R4, R10   // w1<<s
-	MOVD R10, (R2) // z[0] = w1<<s
-	RET
-
-X8b:
-	MOVD R0, c+56(FP)
-	RET
-
-Z80:
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5       // n = n*8
-
-	MOVD (R8), R10
-	MOVD $0, R3
-	MOVD R3, c+56(FP)
-
-	MOVD $0, R1 // i = 0
-	BR   E8Z
+	BR ·shlVU_g(SB)
 
-	// i < n-1
-L8Z:
-	MOVD R10, R3
-	MOVD 8(R8)(R1*1), R10
-
-	MOVD R3, (R2)(R1*1)
-	ADD  $8, R1
-
-E8Z:
-	CMPBLT R1, R5, L8Z
-
-	// i >= n-1
-	MOVD R10, (R2)(R5*1)
-	RET
-
-Z864:
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5         // n = n*8
-	MOVD (R8)(R5*1), R3 // w1 = x[n-1]
-	MOVD R3, c+56(FP)   // z[i] = x[n-1]
-
-	BR E864
-
-	// i < n-1
-L864:
-	MOVD -8(R8)(R5*1), R3
-
-	MOVD R3, (R2)(R5*1) // z[i] = x[n-1]
-	SUB  $8, R5         // i--
-
-E864:
-	CMPBGT R5, R0, L864 // i < n-1
-
-	MOVD R0, (R2) // z[n-1] = 0
-	RET
-
-// CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
 // func shrVU(z, x []Word, s uint) (c Word)
 TEXT ·shrVU(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R5
-	MOVD $0, R0
-	SUB  $1, R5          // n--
-	BLT  X9b             // n < 0        (n <= 0)
-
-	// n > 0
-	MOVD   s+48(FP), R4
-	CMPBEQ R0, R4, ZB0  // handle 0 case beq
-	MOVD   $64, R6
-	CMPBEQ R6, R4, ZB64 // handle 64 case beq
-	MOVD   z+0(FP), R2
-	MOVD   x+24(FP), R8
-	SLD    $3, R5       // n = n*8
-	SUB    R4, R6, R7
-	MOVD   (R8), R10    // w1 = x[0]
-	SLD    R7, R10, R3
-	MOVD   R3, c+56(FP)
-
-	MOVD $0, R1 // i = 0
-	BR   E9
-
-	// i < n-1
-L9:
-	MOVD R10, R3          // w = w1
-	MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
-
-	SRD  R4, R3         // w>>s | w1<<s
-	SLD  R7, R10, R6
-	OR   R6, R3
-	MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s
-	ADD  $8, R1         // i++
-
-E9:
-	CMPBLT R1, R5, L9 // i < n-1
-
-	// i >= n-1
-X9a:
-	SRD  R4, R10         // w1>>s
-	MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s
-	RET
-
-X9b:
-	MOVD R0, c+56(FP)
-	RET
-
-ZB0:
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5       // n = n*8
-
-	MOVD (R8), R10    // w1 = x[0]
-	MOVD $0, R3       // R10 << 64
-	MOVD R3, c+56(FP)
-
-	MOVD $0, R1 // i = 0
-	BR   E9Z
-
-	// i < n-1
-L9Z:
-	MOVD R10, R3          // w = w1
-	MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
-
-	MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s
-	ADD  $8, R1         // i++
-
-E9Z:
-	CMPBLT R1, R5, L9Z // i < n-1
-
-	// i >= n-1
-	MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s
-	RET
-
-ZB64:
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5       // n = n*8
-	MOVD (R8), R3     // w1 = x[0]
-	MOVD R3, c+56(FP)
-
-	MOVD $0, R1 // i = 0
-	BR   E964
-
-	// i < n-1
-L964:
-	MOVD 8(R8)(R1*1), R3 // w1 = x[i+1]
-
-	MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s
-	ADD  $8, R1         // i++
-
-E964:
-	CMPBLT R1, R5, L964 // i < n-1
-
-	// i >= n-1
-	MOVD $0, R10         // w1>>s
-	MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s
-	RET
+	BR ·shrVU_g(SB)
 
 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
@@ -990,27 +793,3 @@ E6:
 	MOVD R4, c+56(FP)
 	RET
 
-// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
-// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
-TEXT ·divWVW(SB), NOSPLIT, $0
-	MOVD z+0(FP), R2
-	MOVD xn+24(FP), R10  // r = xn
-	MOVD x+32(FP), R8
-	MOVD y+56(FP), R9
-	MOVD z_len+8(FP), R7 // i = z
-	SLD  $3, R7, R1      // i*8
-	MOVD $0, R0          // make sure it's zero
-	BR   E7
-
-L7:
-	MOVD (R8)(R1*1), R11
-	WORD $0xB98700A9     // DLGR R10,R9
-	MOVD R11, (R2)(R1*1)
-
-E7:
-	SUB $1, R7 // i--
-	SUB $8, R1
-	BGE L7     // i >= 0
-
-	MOVD R10, r+64(FP)
-	RET
diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go
index fc205934c5..2aca0effde 100644
--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@@ -7,6 +7,7 @@ package big
 import (
 	"fmt"
 	"internal/testenv"
+	"math/bits"
 	"math/rand"
 	"strings"
 	"testing"
@@ -284,20 +285,56 @@ type argVU struct {
 	m  string // message.
 }
 
+var argshlVUIn = []Word{1, 2, 4, 8, 16, 32, 64, 0, 0, 0}
+var argshlVUr0 = []Word{1, 2, 4, 8, 16, 32, 64}
+var argshlVUr1 = []Word{2, 4, 8, 16, 32, 64, 128}
+var argshlVUrWm1 = []Word{1 << (_W - 1), 0, 1, 2, 4, 8, 16}
+
 var argshlVU = []argVU{
 	// test cases for shlVU
 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0}, 7, 0, 0, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "complete overlap of shlVU"},
 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by half of shlVU"},
 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by 1 Word of shlVU"},
 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "no overlap of shlVU"},
+	// additional test cases with shift values of 0, 1 and (_W-1)
+	{argshlVUIn, 7, 0, 0, 0, argshlVUr0, 0, "complete overlap of shlVU and shift of 0"},
+	{argshlVUIn, 7, 0, 0, 1, argshlVUr1, 0, "complete overlap of shlVU and shift of 1"},
+	{argshlVUIn, 7, 0, 0, _W - 1, argshlVUrWm1, 32, "complete overlap of shlVU and shift of _W - 1"},
+	{argshlVUIn, 7, 0, 1, 0, argshlVUr0, 0, "partial overlap by 6 Words of shlVU and shift of 0"},
+	{argshlVUIn, 7, 0, 1, 1, argshlVUr1, 0, "partial overlap by 6 Words of shlVU and shift of 1"},
+	{argshlVUIn, 7, 0, 1, _W - 1, argshlVUrWm1, 32, "partial overlap by 6 Words of shlVU and shift of _W - 1"},
+	{argshlVUIn, 7, 0, 2, 0, argshlVUr0, 0, "partial overlap by 5 Words of shlVU and shift of 0"},
+	{argshlVUIn, 7, 0, 2, 1, argshlVUr1, 0, "partial overlap by 5 Words of shlVU and shift of 1"},
+	{argshlVUIn, 7, 0, 2, _W - 1, argshlVUrWm1, 32, "partial overlap by 5 Words of shlVU and shift of _W - 1"},
+	{argshlVUIn, 7, 0, 3, 0, argshlVUr0, 0, "partial overlap by 4 Words of shlVU and shift of 0"},
+	{argshlVUIn, 7, 0, 3, 1, argshlVUr1, 0, "partial overlap by 4 Words of shlVU and shift of 1"},
+	{argshlVUIn, 7, 0, 3, _W - 1, argshlVUrWm1, 32, "partial overlap by 4 Words of shlVU and shift of _W - 1"},
 }
 
+var argshrVUIn = []Word{0, 0, 0, 1, 2, 4, 8, 16, 32, 64}
+var argshrVUr0 = []Word{1, 2, 4, 8, 16, 32, 64}
+var argshrVUr1 = []Word{0, 1, 2, 4, 8, 16, 32}
+var argshrVUrWm1 = []Word{4, 8, 16, 32, 64, 128, 0}
+
 var argshrVU = []argVU{
 	// test cases for shrVU
 	{[]Word{0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 1, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "complete overlap of shrVU"},
 	{[]Word{0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 4, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by half of shrVU"},
 	{[]Word{0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 7, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by 1 Word of shrVU"},
 	{[]Word{0, 0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 8, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "no overlap of shrVU"},
+	// additional test cases with shift values of 0, 1 and (_W-1)
+	{argshrVUIn, 7, 3, 3, 0, argshrVUr0, 0, "complete overlap of shrVU and shift of 0"},
+	{argshrVUIn, 7, 3, 3, 1, argshrVUr1, 1 << (_W - 1), "complete overlap of shrVU and shift of 1"},
+	{argshrVUIn, 7, 3, 3, _W - 1, argshrVUrWm1, 2, "complete overlap of shrVU and shift of _W - 1"},
+	{argshrVUIn, 7, 3, 2, 0, argshrVUr0, 0, "partial overlap by 6 Words of shrVU and shift of 0"},
+	{argshrVUIn, 7, 3, 2, 1, argshrVUr1, 1 << (_W - 1), "partial overlap by 6 Words of shrVU and shift of 1"},
+	{argshrVUIn, 7, 3, 2, _W - 1, argshrVUrWm1, 2, "partial overlap by 6 Words of shrVU and shift of _W - 1"},
+	{argshrVUIn, 7, 3, 1, 0, argshrVUr0, 0, "partial overlap by 5 Words of shrVU and shift of 0"},
+	{argshrVUIn, 7, 3, 1, 1, argshrVUr1, 1 << (_W - 1), "partial overlap by 5 Words of shrVU and shift of 1"},
+	{argshrVUIn, 7, 3, 1, _W - 1, argshrVUrWm1, 2, "partial overlap by 5 Words of shrVU and shift of _W - 1"},
+	{argshrVUIn, 7, 3, 0, 0, argshrVUr0, 0, "partial overlap by 4 Words of shrVU and shift of 0"},
+	{argshrVUIn, 7, 3, 0, 1, argshrVUr1, 1 << (_W - 1), "partial overlap by 4 Words of shrVU and shift of 1"},
+	{argshrVUIn, 7, 3, 0, _W - 1, argshrVUrWm1, 2, "partial overlap by 4 Words of shrVU and shift of _W - 1"},
 }
 
 func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) {
@@ -334,11 +371,24 @@ func TestIssue31084(t *testing.T) {
 	// compute 10^n via 5^n << n.
 	const n = 165
 	p := nat(nil).expNN(nat{5}, nat{n}, nil)
-	p = p.shl(p, uint(n))
+	p = p.shl(p, n)
 	got := string(p.utoa(10))
 	want := "1" + strings.Repeat("0", n)
 	if got != want {
-		t.Errorf("shl(%v, %v)\n\tgot %s; want %s\n", p, uint(n), got, want)
+		t.Errorf("shl(%v, %v)\n\tgot  %s\n\twant %s", p, n, got, want)
+	}
+}
+
+const issue42838Value = "159309191113245227702888039776771180559110455519261878607388585338616290151305816094308987472018268594098344692611135542392730712890625"
+
+func TestIssue42838(t *testing.T) {
+	const s = 192 // shift amount: issue42838Value << s must print as "1" followed by s zeros
+	z, _, _, _ := nat(nil).scan(strings.NewReader(issue42838Value), 0, false)
+	z = z.shl(z, s)
+	got := string(z.utoa(10))
+	want := "1" + strings.Repeat("0", s)
+	if got != want {
+		t.Errorf("shl(%v, %v)\n\tgot  %s\n\twant %s", z, s, got, want)
 	}
 }
 
@@ -493,7 +543,6 @@ func TestFunVWW(t *testing.T) {
 
 		if a.y != 0 && a.r < a.y {
 			arg := argWVW{a.x, a.c, a.z, a.y, a.r}
-			testFunWVW(t, "divWVW_g", divWVW_g, arg)
 			testFunWVW(t, "divWVW", divWVW, arg)
 		}
 	}
@@ -536,6 +585,42 @@ func TestMulAddWWW(t *testing.T) {
 	}
 }
 
+var divWWTests = []struct {
+	x1, x0, y Word // dividend words (high x1, low x0) and divisor y passed to divWW
+	q, r      Word // expected quotient and remainder
+}{
+	{_M >> 1, 0, _M, _M >> 1, _M >> 1},
+	{_M - (1 << (_W - 2)), _M, 3 << (_W - 2), _M, _M - (1 << (_W - 2))},
+}
+
+const testsNumber = 1 << 16
+
+func TestDivWW(t *testing.T) {
+	i := 0 // the range loop below declares its own i, so this one is still 0 when the random loop starts
+	for i, test := range divWWTests {
+		rec := reciprocalWord(test.y)
+		q, r := divWW(test.x1, test.x0, test.y, rec)
+		if q != test.q || r != test.r {
+			t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r)
+		}
+	}
+	// Random tests: check divWW against bits.Div on inputs that satisfy x1 < y.
+	for ; i < testsNumber; i++ {
+		x1 := rndW()
+		x0 := rndW()
+		y := rndW()
+		if x1 >= y {
+			continue
+		}
+		rec := reciprocalWord(y)
+		qGot, rGot := divWW(x1, x0, y, rec)
+		qWant, rWant := bits.Div(uint(x1), uint(x0), uint(y))
+		if uint(qGot) != qWant || uint(rGot) != rWant {
+			t.Errorf("#%d got (%x, %x) want (%x, %x)", i, qGot, rGot, qWant, rWant)
+		}
+	}
+}
+
 func BenchmarkMulAddVWW(b *testing.B) {
 	for _, n := range benchSizes {
 		if isRaceBuilder && n > 1e3 {
@@ -570,3 +655,19 @@ func BenchmarkAddMulVVW(b *testing.B) {
 		})
 	}
 }
+func BenchmarkDivWVW(b *testing.B) {
+	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
+		x := rndV(n)
+		y := rndW()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _W))
+			for i := 0; i < b.N; i++ {
+				divWVW(z, 0, x, y) // xn = 0, so the xn < y requirement holds for any nonzero y
+			}
+		})
+	}
+}
diff --git a/src/math/big/arith_wasm.s b/src/math/big/arith_wasm.s
index 382597c694..add1064469 100644
--- a/src/math/big/arith_wasm.s
+++ b/src/math/big/arith_wasm.s
@@ -9,9 +9,6 @@
 TEXT ·mulWW(SB),NOSPLIT,$0
 	JMP ·mulWW_g(SB)
 
-TEXT ·divWW(SB),NOSPLIT,$0
-	JMP ·divWW_g(SB)
-
 TEXT ·addVV(SB),NOSPLIT,$0
 	JMP ·addVV_g(SB)
 
@@ -36,5 +33,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
 TEXT ·addMulVVW(SB),NOSPLIT,$0
 	JMP ·addMulVVW_g(SB)
 
-TEXT ·divWVW(SB),NOSPLIT,$0
-	JMP ·divWVW_g(SB)
diff --git a/src/math/big/decimal.go b/src/math/big/decimal.go
index ae9ffb5db6..716f03bfa4 100644
--- a/src/math/big/decimal.go
+++ b/src/math/big/decimal.go
@@ -166,18 +166,21 @@ func (x *decimal) String() string {
 	switch {
 	case x.exp <= 0:
 		// 0.00ddd
+		buf = make([]byte, 0, 2+(-x.exp)+len(x.mant))
 		buf = append(buf, "0."...)
 		buf = appendZeros(buf, -x.exp)
 		buf = append(buf, x.mant...)
 
 	case /* 0 < */ x.exp < len(x.mant):
 		// dd.ddd
+		buf = make([]byte, 0, 1+len(x.mant))
 		buf = append(buf, x.mant[:x.exp]...)
 		buf = append(buf, '.')
 		buf = append(buf, x.mant[x.exp:]...)
 
 	default: // len(x.mant) <= x.exp
 		// ddd00
+		buf = make([]byte, 0, x.exp)
 		buf = append(buf, x.mant...)
 		buf = appendZeros(buf, x.exp-len(x.mant))
 	}
diff --git a/src/math/big/link_test.go b/src/math/big/link_test.go
index 2212bd444f..42f9cefca0 100644
--- a/src/math/big/link_test.go
+++ b/src/math/big/link_test.go
@@ -7,7 +7,7 @@ package big
 import (
 	"bytes"
 	"internal/testenv"
-	"io/ioutil"
+	"os"
 	"os/exec"
 	"path/filepath"
 	"testing"
@@ -27,7 +27,7 @@ func TestLinkerGC(t *testing.T) {
 import _ "math/big"
 func main() {}
 `)
-	if err := ioutil.WriteFile(goFile, file, 0644); err != nil {
+	if err := os.WriteFile(goFile, file, 0644); err != nil {
 		t.Fatal(err)
 	}
 	cmd := exec.Command(goBin, "build", "-o", "x.exe", "x.go")
diff --git a/src/math/big/nat.go b/src/math/big/nat.go
index 6a3989bf9d..068176e1c1 100644
--- a/src/math/big/nat.go
+++ b/src/math/big/nat.go
@@ -751,6 +751,7 @@ func (q nat) divBasic(u, v nat) {
 
 	// D2.
 	vn1 := v[n-1]
+	rec := reciprocalWord(vn1)
 	for j := m; j >= 0; j-- {
 		// D3.
 		qhat := Word(_M)
@@ -760,7 +761,7 @@ func (q nat) divBasic(u, v nat) {
 		}
 		if ujn != vn1 {
 			var rhat Word
-			qhat, rhat = divWW(ujn, u[j+n-1], vn1)
+			qhat, rhat = divWW(ujn, u[j+n-1], vn1, rec)
 
 			// x1 | x2 = q̂v_{n-2}
 			vn2 := v[n-2]
@@ -928,7 +929,7 @@ func (z nat) divRecursiveStep(u, v nat, depth int, tmp *nat, temps []*nat) {
 
 	// Now u < (v<<B), compute lower bits in the same way.
 	// Choose shift = B-1 again.
-	s := B
+	s := B - 1
 	qhat := *temps[depth]
 	qhat.clear()
 	qhat.divRecursiveStep(u[s:].norm(), v[s:], depth+1, tmp, temps)