From d54a9a9c42e751a020308cae296add426b56d0f0 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 25 Aug 2020 16:33:50 +0800 Subject: math/big: replace division with multiplication by reciprocal word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Division is much slower than multiplication. And the method of using multiplication by multiplying reciprocal and replacing division with it can increase the speed of divWVW algorithm by three times,and at the same time increase the speed of nats division. The benchmark test on arm64 is as follows: name old time/op new time/op delta DivWVW/1-4 13.1ns ± 4% 13.3ns ± 4% ~ (p=0.444 n=5+5) DivWVW/2-4 48.6ns ± 1% 51.2ns ± 2% +5.39% (p=0.008 n=5+5) DivWVW/3-4 82.0ns ± 1% 69.7ns ± 1% -15.03% (p=0.008 n=5+5) DivWVW/4-4 116ns ± 1% 71ns ± 2% -38.88% (p=0.008 n=5+5) DivWVW/5-4 152ns ± 1% 84ns ± 4% -44.70% (p=0.008 n=5+5) DivWVW/10-4 319ns ± 1% 155ns ± 4% -51.50% (p=0.008 n=5+5) DivWVW/100-4 3.44µs ± 3% 1.30µs ± 8% -62.30% (p=0.008 n=5+5) DivWVW/1000-4 33.8µs ± 0% 10.9µs ± 1% -67.74% (p=0.008 n=5+5) DivWVW/10000-4 343µs ± 4% 111µs ± 5% -67.63% (p=0.008 n=5+5) DivWVW/100000-4 3.35ms ± 1% 1.25ms ± 3% -62.79% (p=0.008 n=5+5) QuoRem-4 3.08µs ± 2% 2.21µs ± 4% -28.40% (p=0.008 n=5+5) ModSqrt225_Tonelli-4 444µs ± 2% 457µs ± 3% ~ (p=0.095 n=5+5) ModSqrt225_3Mod4-4 136µs ± 1% 138µs ± 3% ~ (p=0.151 n=5+5) ModSqrt231_Tonelli-4 473µs ± 3% 483µs ± 4% ~ (p=0.548 n=5+5) ModSqrt231_5Mod8-4 164µs ± 9% 169µs ±12% ~ (p=0.421 n=5+5) Sqrt-4 36.8µs ± 1% 28.6µs ± 0% -22.17% (p=0.016 n=5+4) Div/20/10-4 50.0ns ± 3% 51.3ns ± 6% ~ (p=0.238 n=5+5) Div/40/20-4 49.8ns ± 2% 51.3ns ± 6% ~ (p=0.222 n=5+5) Div/100/50-4 85.8ns ± 4% 86.5ns ± 5% ~ (p=0.246 n=5+5) Div/200/100-4 335ns ± 3% 296ns ± 2% -11.60% (p=0.008 n=5+5) Div/400/200-4 442ns ± 2% 359ns ± 5% -18.81% (p=0.008 n=5+5) Div/1000/500-4 858ns ± 3% 643ns ± 6% -25.06% (p=0.008 n=5+5) Div/2000/1000-4 1.70µs ± 3% 1.28µs ± 4% -24.80% (p=0.008 n=5+5) Div/20000/10000-4 45.0µs ± 5% 41.8µs ± 4% -7.17% (p=0.016 n=5+5) Div/200000/100000-4 1.51ms ± 7% 1.43ms ± 3% -5.42% (p=0.016 n=5+5) Div/2000000/1000000-4 57.6ms ± 4% 57.5ms ± 3% ~ (p=1.000 n=5+5) Div/20000000/10000000-4 2.08s ± 3% 2.04s ± 1% ~ (p=0.095 n=5+5) name old speed new speed delta DivWVW/1-4 4.87GB/s ± 4% 4.80GB/s ± 4% ~ (p=0.310 n=5+5) DivWVW/2-4 2.63GB/s ± 1% 2.50GB/s ± 2% -5.07% (p=0.008 n=5+5) DivWVW/3-4 2.34GB/s ± 1% 2.76GB/s ± 1% +17.70% (p=0.008 n=5+5) DivWVW/4-4 2.21GB/s ± 1% 3.61GB/s ± 2% +63.42% (p=0.008 n=5+5) DivWVW/5-4 2.10GB/s ± 2% 3.81GB/s ± 4% +80.89% (p=0.008 n=5+5) DivWVW/10-4 2.01GB/s ± 0% 4.13GB/s ± 4% +105.91% (p=0.008 n=5+5) DivWVW/100-4 1.86GB/s ± 2% 4.95GB/s ± 7% +165.63% (p=0.008 n=5+5) DivWVW/1000-4 1.89GB/s ± 0% 5.86GB/s ± 1% +209.96% (p=0.008 n=5+5) DivWVW/10000-4 1.87GB/s ± 4% 5.76GB/s ± 5% +208.96% (p=0.008 n=5+5) DivWVW/100000-4 1.91GB/s ± 1% 5.14GB/s ± 3% +168.85% (p=0.008 n=5+5) Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830 Reviewed-on: https://go-review.googlesource.com/c/go/+/250417 Trust: Giovanni Bajo Trust: Keith Randall Run-TryBot: Giovanni Bajo TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/math/big/arith_amd64.s | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'src/math/big/arith_amd64.s') diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s index b75639f540..61043ca2d9 100644 --- a/src/math/big/arith_amd64.s +++ b/src/math/big/arith_amd64.s @@ -18,14 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVQ x1+0(FP), DX - MOVQ x0+8(FP), AX - DIVQ y+16(FP) - MOVQ AX, q+24(FP) - MOVQ DX, r+32(FP) - RET // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0. // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared. @@ -531,21 +523,3 @@ adx_short: -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVQ z+0(FP), R10 - MOVQ xn+24(FP), DX // r = xn - MOVQ x+32(FP), R8 - MOVQ y+56(FP), R9 - MOVQ z_len+8(FP), BX // i = z - JMP E7 - -L7: MOVQ (R8)(BX*8), AX - DIVQ R9 - MOVQ AX, (R10)(BX*8) - -E7: SUBQ $1, BX // i-- - JGE L7 // i >= 0 - - MOVQ DX, r+64(FP) - RET -- cgit v1.3