From 0c43878baa035db39d9bbf84ce8721cd8a97c78a Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Tue, 8 Jun 2021 13:16:01 -0500 Subject: cmd/compile: lower Add64/Sub64 into ssa on PPC64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit math/bits.Add64 and math/bits.Sub64 now lower and optimize directly in SSA form. The optimization of carry chains focuses on eliding XER<->GPR transfers of the CA bit when used exclusively as an input to a single carry operation, or when the CA value is known. This also adds support for handling XER spills in the assembler which could happen if carry chains contain inter-dependencies on each other (which seems very unlikely with practical usage), or a clobber happens (SRAW/SRAD/SUBFC operations clobber CA). With PPC64 Add64/Sub64 lowering into SSA and this patch, the net performance difference in crypto/elliptic benchmarks on P9/ppc64le is: name old time/op new time/op delta ScalarBaseMult/P256 46.3µs ± 0% 46.9µs ± 0% +1.34% ScalarBaseMult/P224 356µs ± 0% 209µs ± 0% -41.14% ScalarBaseMult/P384 1.20ms ± 0% 0.57ms ± 0% -52.14% ScalarBaseMult/P521 3.38ms ± 0% 1.44ms ± 0% -57.27% ScalarMult/P256 199µs ± 0% 199µs ± 0% -0.17% ScalarMult/P224 357µs ± 0% 212µs ± 0% -40.56% ScalarMult/P384 1.20ms ± 0% 0.58ms ± 0% -51.86% ScalarMult/P521 3.37ms ± 0% 1.44ms ± 0% -57.32% MarshalUnmarshal/P256/Uncompressed 2.59µs ± 0% 2.52µs ± 0% -2.63% MarshalUnmarshal/P256/Compressed 2.58µs ± 0% 2.52µs ± 0% -2.06% MarshalUnmarshal/P224/Uncompressed 1.54µs ± 0% 1.40µs ± 0% -9.42% MarshalUnmarshal/P224/Compressed 1.54µs ± 0% 1.39µs ± 0% -9.87% MarshalUnmarshal/P384/Uncompressed 2.40µs ± 0% 1.80µs ± 0% -24.93% MarshalUnmarshal/P384/Compressed 2.35µs ± 0% 1.81µs ± 0% -23.03% MarshalUnmarshal/P521/Uncompressed 3.79µs ± 0% 2.58µs ± 0% -31.81% MarshalUnmarshal/P521/Compressed 3.80µs ± 0% 2.60µs ± 0% -31.67% Note, P256 uses an asm implementation, thus, little variation is expected. 
Change-Id: I88a24f6bf0f4f285c649e40243b1ab69cc452b71 Reviewed-on: https://go-review.googlesource.com/c/go/+/346870 Reviewed-by: Lynn Boger Reviewed-by: Dmitri Shuralyov Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor --- test/codegen/mathbits.go | 56 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 6 deletions(-) (limited to 'test/codegen/mathbits.go') diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 58d57b3523..1ddb5c75cc 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -423,6 +423,8 @@ func IterateBits8(n uint8) int { func Add(x, y, ci uint) (r, co uint) { // arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" // amd64:"NEGL","ADCQ","SBBQ","NEGQ" + // ppc64: "ADDC", "ADDE", "ADDZE" + // ppc64le: "ADDC", "ADDE", "ADDZE" // s390x:"ADDE","ADDC\t[$]-1," return bits.Add(x, y, ci) } @@ -430,6 +432,8 @@ func Add(x, y, ci uint) (r, co uint) { func AddC(x, ci uint) (r, co uint) { // arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" // amd64:"NEGL","ADCQ","SBBQ","NEGQ" + // ppc64: "ADDC", "ADDE", "ADDZE" + // ppc64le: "ADDC", "ADDE", "ADDZE" // s390x:"ADDE","ADDC\t[$]-1," return bits.Add(x, 7, ci) } @@ -437,6 +441,8 @@ func AddC(x, ci uint) (r, co uint) { func AddZ(x, y uint) (r, co uint) { // arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP" // amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ" + // ppc64: "ADDC", -"ADDE", "ADDZE" + // ppc64le: "ADDC", -"ADDE", "ADDZE" // s390x:"ADDC",-"ADDC\t[$]-1," return bits.Add(x, y, 0) } @@ -444,6 +450,8 @@ func AddZ(x, y uint) (r, co uint) { func AddR(x, y, ci uint) uint { // arm64:"ADDS","ADCS",-"ADD\t",-"CMP" // amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ" + // ppc64: "ADDC", "ADDE", -"ADDZE" + // ppc64le: "ADDC", "ADDE", -"ADDZE" // s390x:"ADDE","ADDC\t[$]-1," r, _ := bits.Add(x, y, ci) return r @@ -480,8 +488,8 @@ func Add64C(x, ci uint64) (r, co uint64) { func Add64Z(x, y uint64) (r, co uint64) { // arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP" // 
amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ" - // ppc64: "ADDC", "ADDE", "ADDZE" - // ppc64le: "ADDC", "ADDE", "ADDZE" + // ppc64: "ADDC", -"ADDE", "ADDZE" + // ppc64le: "ADDC", -"ADDE", "ADDZE" // s390x:"ADDC",-"ADDC\t[$]-1," return bits.Add64(x, y, 0) } @@ -489,8 +497,8 @@ func Add64Z(x, y uint64) (r, co uint64) { func Add64R(x, y, ci uint64) uint64 { // arm64:"ADDS","ADCS",-"ADD\t",-"CMP" // amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ" - // ppc64: "ADDC", "ADDE", "ADDZE" - // ppc64le: "ADDC", "ADDE", "ADDZE" + // ppc64: "ADDC", "ADDE", -"ADDZE" + // ppc64le: "ADDC", "ADDE", -"ADDZE" // s390x:"ADDE","ADDC\t[$]-1," r, _ := bits.Add64(x, y, ci) return r @@ -500,13 +508,22 @@ func Add64M(p, q, r *[3]uint64) { r[0], c = bits.Add64(p[0], q[0], c) // arm64:"ADCS",-"ADD\t",-"CMP" // amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ" - // ppc64: "ADDC", "ADDE", "ADDZE" - // ppc64le: "ADDC", "ADDE", "ADDZE" + // ppc64: -"ADDC", "ADDE", -"ADDZE" + // ppc64le: -"ADDC", "ADDE", -"ADDZE" // s390x:"ADDE",-"ADDC\t[$]-1," r[1], c = bits.Add64(p[1], q[1], c) r[2], c = bits.Add64(p[2], q[2], c) } +func Add64MSaveC(p, q, r, c *[2]uint64) { + // ppc64: "ADDC\tR", "ADDZE" + // ppc64le: "ADDC\tR", "ADDZE" + r[0], c[0] = bits.Add64(p[0], q[0], 0) + // ppc64: "ADDC\t[$]-1", "ADDE", "ADDZE" + // ppc64le: "ADDC\t[$]-1", "ADDE", "ADDZE" + r[1], c[1] = bits.Add64(p[1], q[1], c[0]) +} + func Add64PanicOnOverflowEQ(a, b uint64) uint64 { r, c := bits.Add64(a, b, 0) // s390x:"BRC\t[$]3,",-"ADDE" @@ -577,6 +594,8 @@ func Add64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 { func Sub(x, y, ci uint) (r, co uint) { // amd64:"NEGL","SBBQ","NEGQ" // arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", "SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // s390x:"SUBE" return bits.Sub(x, y, ci) } @@ -584,6 +603,8 @@ func Sub(x, y, ci uint) (r, co uint) { func SubC(x, ci uint) (r, co uint) { // amd64:"NEGL","SBBQ","NEGQ" // arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", 
"SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // s390x:"SUBE" return bits.Sub(x, 7, ci) } @@ -591,6 +612,8 @@ func SubC(x, ci uint) (r, co uint) { func SubZ(x, y uint) (r, co uint) { // amd64:"SUBQ","SBBQ","NEGQ",-"NEGL" // arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP" + // ppc64:"SUBC", -"SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG" // s390x:"SUBC" return bits.Sub(x, y, 0) } @@ -598,6 +621,8 @@ func SubZ(x, y uint) (r, co uint) { func SubR(x, y, ci uint) uint { // amd64:"NEGL","SBBQ",-"NEGQ" // arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG" + // ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG" // s390x:"SUBE" r, _ := bits.Sub(x, y, ci) return r @@ -607,6 +632,8 @@ func SubM(p, q, r *[3]uint) { r[0], c = bits.Sub(p[0], q[0], c) // amd64:"SBBQ",-"NEGL",-"NEGQ" // arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP" + // ppc64:-"SUBC", "SUBE", -"SUBZE", -"NEG" + // ppc64le:-"SUBC", "SUBE", -"SUBZE", -"NEG" // s390x:"SUBE" r[1], c = bits.Sub(p[1], q[1], c) r[2], c = bits.Sub(p[2], q[2], c) @@ -615,6 +642,8 @@ func SubM(p, q, r *[3]uint) { func Sub64(x, y, ci uint64) (r, co uint64) { // amd64:"NEGL","SBBQ","NEGQ" // arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", "SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // s390x:"SUBE" return bits.Sub64(x, y, ci) } @@ -622,6 +651,8 @@ func Sub64(x, y, ci uint64) (r, co uint64) { func Sub64C(x, ci uint64) (r, co uint64) { // amd64:"NEGL","SBBQ","NEGQ" // arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", "SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // s390x:"SUBE" return bits.Sub64(x, 7, ci) } @@ -629,6 +660,8 @@ func Sub64C(x, ci uint64) (r, co uint64) { func Sub64Z(x, y uint64) (r, co uint64) { // amd64:"SUBQ","SBBQ","NEGQ",-"NEGL" // arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP" + // ppc64:"SUBC", -"SUBE", "SUBZE", "NEG" + // 
ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG" // s390x:"SUBC" return bits.Sub64(x, y, 0) } @@ -636,6 +669,8 @@ func Sub64Z(x, y uint64) (r, co uint64) { func Sub64R(x, y, ci uint64) uint64 { // amd64:"NEGL","SBBQ",-"NEGQ" // arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP" + // ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG" + // ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG" // s390x:"SUBE" r, _ := bits.Sub64(x, y, ci) return r @@ -650,6 +685,15 @@ func Sub64M(p, q, r *[3]uint64) { r[2], c = bits.Sub64(p[2], q[2], c) } +func Sub64MSaveC(p, q, r, c *[2]uint64) { + // ppc64:"SUBC\tR\\d+, R\\d+,", "SUBZE", "NEG" + // ppc64le:"SUBC\tR\\d+, R\\d+,", "SUBZE", "NEG" + r[0], c[0] = bits.Sub64(p[0], q[0], 0) + // ppc64:"SUBC\tR\\d+, [$]0,", "SUBE", "SUBZE", "NEG" + // ppc64le:"SUBC\tR\\d+, [$]0,", "SUBE", "SUBZE", "NEG" + r[1], c[1] = bits.Sub64(p[1], q[1], c[0]) +} + func Sub64PanicOnOverflowEQ(a, b uint64) uint64 { r, b := bits.Sub64(a, b, 0) // s390x:"BRC\t[$]12,",-"ADDE",-"SUBE" -- cgit v1.3