From d54a9a9c42e751a020308cae296add426b56d0f0 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 25 Aug 2020 16:33:50 +0800 Subject: math/big: replace division with multiplication by reciprocal word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Division is much slower than multiplication. And the method of using multiplication by multiplying reciprocal and replacing division with it can increase the speed of divWVW algorithm by three times,and at the same time increase the speed of nats division. The benchmark test on arm64 is as follows: name old time/op new time/op delta DivWVW/1-4 13.1ns ± 4% 13.3ns ± 4% ~ (p=0.444 n=5+5) DivWVW/2-4 48.6ns ± 1% 51.2ns ± 2% +5.39% (p=0.008 n=5+5) DivWVW/3-4 82.0ns ± 1% 69.7ns ± 1% -15.03% (p=0.008 n=5+5) DivWVW/4-4 116ns ± 1% 71ns ± 2% -38.88% (p=0.008 n=5+5) DivWVW/5-4 152ns ± 1% 84ns ± 4% -44.70% (p=0.008 n=5+5) DivWVW/10-4 319ns ± 1% 155ns ± 4% -51.50% (p=0.008 n=5+5) DivWVW/100-4 3.44µs ± 3% 1.30µs ± 8% -62.30% (p=0.008 n=5+5) DivWVW/1000-4 33.8µs ± 0% 10.9µs ± 1% -67.74% (p=0.008 n=5+5) DivWVW/10000-4 343µs ± 4% 111µs ± 5% -67.63% (p=0.008 n=5+5) DivWVW/100000-4 3.35ms ± 1% 1.25ms ± 3% -62.79% (p=0.008 n=5+5) QuoRem-4 3.08µs ± 2% 2.21µs ± 4% -28.40% (p=0.008 n=5+5) ModSqrt225_Tonelli-4 444µs ± 2% 457µs ± 3% ~ (p=0.095 n=5+5) ModSqrt225_3Mod4-4 136µs ± 1% 138µs ± 3% ~ (p=0.151 n=5+5) ModSqrt231_Tonelli-4 473µs ± 3% 483µs ± 4% ~ (p=0.548 n=5+5) ModSqrt231_5Mod8-4 164µs ± 9% 169µs ±12% ~ (p=0.421 n=5+5) Sqrt-4 36.8µs ± 1% 28.6µs ± 0% -22.17% (p=0.016 n=5+4) Div/20/10-4 50.0ns ± 3% 51.3ns ± 6% ~ (p=0.238 n=5+5) Div/40/20-4 49.8ns ± 2% 51.3ns ± 6% ~ (p=0.222 n=5+5) Div/100/50-4 85.8ns ± 4% 86.5ns ± 5% ~ (p=0.246 n=5+5) Div/200/100-4 335ns ± 3% 296ns ± 2% -11.60% (p=0.008 n=5+5) Div/400/200-4 442ns ± 2% 359ns ± 5% -18.81% (p=0.008 n=5+5) Div/1000/500-4 858ns ± 3% 643ns ± 6% -25.06% (p=0.008 n=5+5) Div/2000/1000-4 1.70µs ± 3% 1.28µs ± 4% -24.80% (p=0.008 n=5+5) Div/20000/10000-4 45.0µs ± 5% 41.8µs ± 4% -7.17% (p=0.016 n=5+5) Div/200000/100000-4 1.51ms ± 7% 1.43ms ± 3% -5.42% (p=0.016 n=5+5) Div/2000000/1000000-4 57.6ms ± 4% 57.5ms ± 3% ~ (p=1.000 n=5+5) Div/20000000/10000000-4 2.08s ± 3% 2.04s ± 1% ~ (p=0.095 n=5+5) name old speed new speed delta DivWVW/1-4 4.87GB/s ± 4% 4.80GB/s ± 4% ~ (p=0.310 n=5+5) DivWVW/2-4 2.63GB/s ± 1% 2.50GB/s ± 2% -5.07% (p=0.008 n=5+5) DivWVW/3-4 2.34GB/s ± 1% 2.76GB/s ± 1% +17.70% (p=0.008 n=5+5) DivWVW/4-4 2.21GB/s ± 1% 3.61GB/s ± 2% +63.42% (p=0.008 n=5+5) DivWVW/5-4 2.10GB/s ± 2% 3.81GB/s ± 4% +80.89% (p=0.008 n=5+5) DivWVW/10-4 2.01GB/s ± 0% 4.13GB/s ± 4% +105.91% (p=0.008 n=5+5) DivWVW/100-4 1.86GB/s ± 2% 4.95GB/s ± 7% +165.63% (p=0.008 n=5+5) DivWVW/1000-4 1.89GB/s ± 0% 5.86GB/s ± 1% +209.96% (p=0.008 n=5+5) DivWVW/10000-4 1.87GB/s ± 4% 5.76GB/s ± 5% +208.96% (p=0.008 n=5+5) DivWVW/100000-4 1.91GB/s ± 1% 5.14GB/s ± 3% +168.85% (p=0.008 n=5+5) Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830 Reviewed-on: https://go-review.googlesource.com/c/go/+/250417 Trust: Giovanni Bajo Trust: Keith Randall Run-TryBot: Giovanni Bajo TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/math/big/arith_arm.s | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'src/math/big/arith_arm.s') diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s index 33aa36f709..cbf7445e7a 100644 --- a/src/math/big/arith_arm.s +++ b/src/math/big/arith_arm.s @@ -272,17 +272,6 @@ E9: RET -// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWVW_g(SB) - - -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWW_g(SB) - // func mulWW(x, y Word) (z1, z0 Word) TEXT ·mulWW(SB),NOSPLIT,$0 -- cgit v1.3