From d54a9a9c42e751a020308cae296add426b56d0f0 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 25 Aug 2020 16:33:50 +0800 Subject: math/big: replace division with multiplication by reciprocal word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Division is much slower than multiplication. And the method of using multiplication by multiplying reciprocal and replacing division with it can increase the speed of divWVW algorithm by three times,and at the same time increase the speed of nats division. The benchmark test on arm64 is as follows: name old time/op new time/op delta DivWVW/1-4 13.1ns ± 4% 13.3ns ± 4% ~ (p=0.444 n=5+5) DivWVW/2-4 48.6ns ± 1% 51.2ns ± 2% +5.39% (p=0.008 n=5+5) DivWVW/3-4 82.0ns ± 1% 69.7ns ± 1% -15.03% (p=0.008 n=5+5) DivWVW/4-4 116ns ± 1% 71ns ± 2% -38.88% (p=0.008 n=5+5) DivWVW/5-4 152ns ± 1% 84ns ± 4% -44.70% (p=0.008 n=5+5) DivWVW/10-4 319ns ± 1% 155ns ± 4% -51.50% (p=0.008 n=5+5) DivWVW/100-4 3.44µs ± 3% 1.30µs ± 8% -62.30% (p=0.008 n=5+5) DivWVW/1000-4 33.8µs ± 0% 10.9µs ± 1% -67.74% (p=0.008 n=5+5) DivWVW/10000-4 343µs ± 4% 111µs ± 5% -67.63% (p=0.008 n=5+5) DivWVW/100000-4 3.35ms ± 1% 1.25ms ± 3% -62.79% (p=0.008 n=5+5) QuoRem-4 3.08µs ± 2% 2.21µs ± 4% -28.40% (p=0.008 n=5+5) ModSqrt225_Tonelli-4 444µs ± 2% 457µs ± 3% ~ (p=0.095 n=5+5) ModSqrt225_3Mod4-4 136µs ± 1% 138µs ± 3% ~ (p=0.151 n=5+5) ModSqrt231_Tonelli-4 473µs ± 3% 483µs ± 4% ~ (p=0.548 n=5+5) ModSqrt231_5Mod8-4 164µs ± 9% 169µs ±12% ~ (p=0.421 n=5+5) Sqrt-4 36.8µs ± 1% 28.6µs ± 0% -22.17% (p=0.016 n=5+4) Div/20/10-4 50.0ns ± 3% 51.3ns ± 6% ~ (p=0.238 n=5+5) Div/40/20-4 49.8ns ± 2% 51.3ns ± 6% ~ (p=0.222 n=5+5) Div/100/50-4 85.8ns ± 4% 86.5ns ± 5% ~ (p=0.246 n=5+5) Div/200/100-4 335ns ± 3% 296ns ± 2% -11.60% (p=0.008 n=5+5) Div/400/200-4 442ns ± 2% 359ns ± 5% -18.81% (p=0.008 n=5+5) Div/1000/500-4 858ns ± 3% 643ns ± 6% -25.06% (p=0.008 n=5+5) Div/2000/1000-4 1.70µs ± 3% 1.28µs ± 4% -24.80% (p=0.008 n=5+5) Div/20000/10000-4 45.0µs ± 5% 41.8µs ± 4% -7.17% (p=0.016 n=5+5) Div/200000/100000-4 1.51ms ± 7% 1.43ms ± 3% -5.42% (p=0.016 n=5+5) Div/2000000/1000000-4 57.6ms ± 4% 57.5ms ± 3% ~ (p=1.000 n=5+5) Div/20000000/10000000-4 2.08s ± 3% 2.04s ± 1% ~ (p=0.095 n=5+5) name old speed new speed delta DivWVW/1-4 4.87GB/s ± 4% 4.80GB/s ± 4% ~ (p=0.310 n=5+5) DivWVW/2-4 2.63GB/s ± 1% 2.50GB/s ± 2% -5.07% (p=0.008 n=5+5) DivWVW/3-4 2.34GB/s ± 1% 2.76GB/s ± 1% +17.70% (p=0.008 n=5+5) DivWVW/4-4 2.21GB/s ± 1% 3.61GB/s ± 2% +63.42% (p=0.008 n=5+5) DivWVW/5-4 2.10GB/s ± 2% 3.81GB/s ± 4% +80.89% (p=0.008 n=5+5) DivWVW/10-4 2.01GB/s ± 0% 4.13GB/s ± 4% +105.91% (p=0.008 n=5+5) DivWVW/100-4 1.86GB/s ± 2% 4.95GB/s ± 7% +165.63% (p=0.008 n=5+5) DivWVW/1000-4 1.89GB/s ± 0% 5.86GB/s ± 1% +209.96% (p=0.008 n=5+5) DivWVW/10000-4 1.87GB/s ± 4% 5.76GB/s ± 5% +208.96% (p=0.008 n=5+5) DivWVW/100000-4 1.91GB/s ± 1% 5.14GB/s ± 3% +168.85% (p=0.008 n=5+5) Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830 Reviewed-on: https://go-review.googlesource.com/c/go/+/250417 Trust: Giovanni Bajo Trust: Keith Randall Run-TryBot: Giovanni Bajo TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/math/big/arith.go | 89 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 9 deletions(-) (limited to 'src/math/big/arith.go') diff --git a/src/math/big/arith.go b/src/math/big/arith.go index b0885f261f..750ce8aa39 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -60,12 +60,6 @@ func nlz(x Word) uint { return uint(bits.LeadingZeros(uint(x))) } -// q = (u1<<_W + u0 - r)/v -func divWW_g(u1, u0, v Word) (q, r Word) { - qq, rr := bits.Div(uint(u1), uint(u0), uint(v)) - return Word(qq), Word(rr) -} - // The resulting carry c is either 0 or 1. func addVV_g(z, x, y []Word) (c Word) { // The comment near the top of this file discusses this for loop condition. @@ -207,10 +201,87 @@ func addMulVVW_g(z, x []Word, y Word) (c Word) { return } -func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) { +// q = ( x1 << _W + x0 - r)/y. m = floor(( _B^2 - 1 ) / d - _B). Requiring x1>(_W-s) + x0 <<= s + y <<= s + } + d := uint(y) + // We know that + // m = ⎣(B^2-1)/d⎦-B + // ⎣(B^2-1)/d⎦ = m+B + // (B^2-1)/d = m+B+delta1 0 <= delta1 <= (d-1)/d + // B^2/d = m+B+delta2 0 <= delta2 <= 1 + // The quotient we're trying to compute is + // quotient = ⎣(x1*B+x0)/d⎦ + // = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦ + // = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦ + // = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦ + // The latter two terms of this three-term sum are between 0 and 1. + // So we can compute just the first term, and we will be low by at most 2. + t1, t0 := bits.Mul(uint(m), uint(x1)) + _, c := bits.Add(t0, uint(x0), 0) + t1, _ = bits.Add(t1, uint(x1), c) + // The quotient is either t1, t1+1, or t1+2. + // We'll try t1 and adjust if needed. + qq := t1 + // compute remainder r=x-d*q. + dq1, dq0 := bits.Mul(d, qq) + r0, b := bits.Sub(uint(x0), dq0, 0) + r1, _ := bits.Sub(uint(x1), dq1, b) + // The remainder we just computed is bounded above by B+d: + // r = x1*B + x0 - d*q. + // = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦ + // = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha) 0 <= alpha < 1 + // = x1*B + x0 - x1*d/B*m - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta) - x1*d - x0*d/B + d*alpha 0 <= beta < 1 + // = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha + // = x0 + x1/B + x1*d/B*beta - x0*d/B + d*alpha + // = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha + // < B*(1-d/B) + d*B/B + d because x00), x1= d { + qq++ + r0 -= d + } + return Word(qq), Word(r0 >> s) +} + +func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { r = xn + if len(x) == 1 { + qq, rr := bits.Div(uint(r), uint(x[0]), uint(y)) + z[0] = Word(qq) + return Word(rr) + } + rec := reciprocalWord(y) for i := len(z) - 1; i >= 0; i-- { - z[i], r = divWW_g(r, x[i], y) + z[i], r = divWW(r, x[i], y, rec) } - return + return r +} + +// reciprocalWord return the reciprocal of the divisor. rec = floor(( _B^2 - 1 ) / u - _B). u = d1 << nlz(d1). +func reciprocalWord(d1 Word) Word { + u := uint(d1 << nlz(d1)) + x1 := ^u + x0 := uint(_M) + rec, _ := bits.Div(x1, x0, u) // (_B^2-1)/U-_B = (_B*(_M-C)+_M)/U + return Word(rec) } -- cgit v1.3