aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Garrett Bodley <garrett.bodley@gmail.com> 2024-07-21 14:23:57 -0400
committer Roland Shoemaker <roland@golang.org> 2024-09-04 20:35:25 +0000
commit bcb0f91bbceb3486cc7f10102ff046661fb4d364 (patch)
tree f498babedd4176afa3d38607403fc38f4c7571b6
parent 7eace71069e621a910a5158a1b46314d38f724ae (diff)
download go-x-crypto-bcb0f91bbceb3486cc7f10102ff046661fb4d364.tar.xz
internal/poly1305: Port sum_amd64.s to Avo
This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="internal/poly1305/sum_amd64.s" REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) Change-Id: I80212c95d1b05335d7f6b73a3030b6f812f6105b Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600035 Reviewed-by: Roland Shoemaker <roland@golang.org> Reviewed-by: Filippo Valsorda <filippo@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
-rw-r--r-- internal/poly1305/_asm/go.mod 15
-rw-r--r-- internal/poly1305/_asm/go.sum 12
-rw-r--r-- internal/poly1305/_asm/sum_amd64_asm.go 126
-rw-r--r-- internal/poly1305/sum_amd64.s 133
4 files changed, 212 insertions, 74 deletions
diff --git a/internal/poly1305/_asm/go.mod b/internal/poly1305/_asm/go.mod
new file mode 100644
index 0000000..47f2b75
--- /dev/null
+++ b/internal/poly1305/_asm/go.mod
@@ -0,0 +1,15 @@
+module internal/poly1305/_asm
+
+go 1.23
+
+require (
+ github.com/mmcloughlin/avo v0.6.0
+ golang.org/x/crypto v0.26.0
+)
+
+require (
+ golang.org/x/mod v0.20.0 // indirect
+ golang.org/x/sync v0.8.0 // indirect
+ golang.org/x/sys v0.24.0 // indirect
+ golang.org/x/tools v0.24.0 // indirect
+)
diff --git a/internal/poly1305/_asm/go.sum b/internal/poly1305/_asm/go.sum
new file mode 100644
index 0000000..62ea9df
--- /dev/null
+++ b/internal/poly1305/_asm/go.sum
@@ -0,0 +1,12 @@
+github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
+github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
+golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
+golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
+golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
+golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
+golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
diff --git a/internal/poly1305/_asm/sum_amd64_asm.go b/internal/poly1305/_asm/sum_amd64_asm.go
new file mode 100644
index 0000000..a445c68
--- /dev/null
+++ b/internal/poly1305/_asm/sum_amd64_asm.go
@@ -0,0 +1,126 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ . "github.com/mmcloughlin/avo/build"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+ _ "golang.org/x/crypto/sha3"
+)
+
+//go:generate go run . -out ../sum_amd64.s -pkg poly1305
+
+func main() { // main drives the Avo generator: declare the target package, emit update, then generate the output.
+ Package("golang.org/x/crypto/internal/poly1305") // import path of the package the generated assembly belongs to
+ ConstraintExpr("gc,!purego") // shows up in the generated sum_amd64.s as "//go:build gc && !purego"
+ update() // emit the body of the single assembly routine, update
+ Generate() // hand the recorded instructions to Avo; the go:generate directive directs the output to ../sum_amd64.s
+}
+
+func update() { // update emits the assembly for func update(state *macState, msg []byte); instruction order and registers mirror the hand-written reference.
+ Implement("update") // bind to the Go declaration of update (presumably resolved by Avo from the package named in main — confirm)
+
+ Load(Param("state"), RDI) // DI = state pointer
+ MOVQ(NewParamAddr("msg_base", 8), RSI) // SI = msg data pointer, argument at offset 8 from FP
+ MOVQ(NewParamAddr("msg_len", 16), R15) // R15 = msg length in bytes, argument at offset 16 from FP
+
+ MOVQ(Mem{Base: DI}.Offset(0), R8) // h0: state word at offset 0, written back at "done"
+ MOVQ(Mem{Base: DI}.Offset(8), R9) // h1: state word at offset 8, written back at "done"
+ MOVQ(Mem{Base: DI}.Offset(16), R10) // h2: state word at offset 16, written back at "done"
+ MOVQ(Mem{Base: DI}.Offset(24), R11) // r0: state word at offset 24, read-only here
+ MOVQ(Mem{Base: DI}.Offset(32), R12) // r1: state word at offset 32, read-only here
+
+ CMPQ(R15, Imm(16)) // fewer than 16 bytes in total? then skip the full-block loop
+ JB(LabelRef("bytes_between_0_and_15"))
+
+ Label("loop") // one iteration per full 16-byte block
+ POLY1305_ADD(RSI, R8, R9, R10) // h += block plus a 1 added into h2; SI advances by 16
+
+ Label("multiply") // also the re-entry point for the padded final partial block
+ POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14) // h = h * r with partial reduction; BX, CX, R13, R14 are scratch
+ SUBQ(Imm(16), R15) // account for the 16 bytes just consumed
+ CMPQ(R15, Imm(16))
+ JAE(LabelRef("loop")) // keep looping while at least one full block remains
+
+ Label("bytes_between_0_and_15") // tail handling: 0..15 bytes left in R15
+ TESTQ(R15, R15)
+ JZ(LabelRef("done")) // nothing left: store the state and return
+ MOVQ(U32(1), RBX) // start the CX:BX accumulator at 1; it ends up as the byte just above the message bytes
+ XORQ(RCX, RCX) // high half of the accumulator = 0
+ XORQ(R13, R13) // clear R13 so the byte loads below leave its upper bits zero
+ ADDQ(R15, RSI) // SI = one past the last remaining byte; the loop walks backwards
+
+ Label("flush_buffer") // folds the remaining bytes, last to first, into CX:BX
+ SHLQ(Imm(8), RBX, RCX) // CX = CX<<8 with the top byte of BX shifted in (double-width shift)
+ SHLQ(Imm(8), RBX) // BX <<= 8, completing the 128-bit shift of CX:BX by one byte
+ MOVB(Mem{Base: SI}.Offset(-1), R13B) // load the byte just below SI
+ XORQ(R13, RBX) // place it in the now-empty low byte of BX
+ DECQ(RSI) // step back one byte
+ DECQ(R15) // one fewer byte to fold; sets ZF for the JNZ below
+ JNZ(LabelRef("flush_buffer"))
+
+ ADDQ(RBX, R8) // h += padded tail: low word
+ ADCQ(RCX, R9) // high word plus carry
+ ADCQ(Imm(0), R10) // propagate the carry into h2
+ MOVQ(U32(16), R15) // make the SUBQ after "multiply" leave R15 == 0, so control then falls through to "done"
+ JMP(LabelRef("multiply"))
+
+ Label("done") // write h0..h2 back to the state; r0 and r1 were never modified
+ MOVQ(R8, Mem{Base: DI}.Offset(0))
+ MOVQ(R9, Mem{Base: DI}.Offset(8))
+ MOVQ(R10, Mem{Base: DI}.Offset(16))
+ RET()
+}
+
+func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) { // POLY1305_ADD emits: add the 16 bytes at msg into h0:h1:h2, then advance msg by 16.
+ ADDQ(Mem{Base: msg}.Offset(0), h0) // h0 += low 8 bytes of the block
+ ADCQ(Mem{Base: msg}.Offset(8), h1) // h1 += high 8 bytes plus carry
+ ADCQ(Imm(1), h2) // h2 += 1 plus carry; the constant 1 sits just above the 128 message bits
+ LEAQ(Mem{Base: msg}.Offset(16), msg) // msg += 16; LEAQ is used so the pointer bump does not touch the flags
+}
+
+func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) { // POLY1305_MUL emits h = h * r with a partial reduction; t0..t3 are scratch and RAX/RDX are clobbered by MULQ.
+ MOVQ(r0, RAX)
+ MULQ(h0) // RDX:RAX = r0 * h0
+ MOVQ(RAX, t0) // t0 = low word of r0*h0
+ MOVQ(RDX, t1) // t1 = high word of r0*h0
+ MOVQ(r0, RAX)
+ MULQ(h1) // RDX:RAX = r0 * h1
+ ADDQ(RAX, t1) // t1 += low word of r0*h1
+ ADCQ(Imm(0), RDX) // fold the carry into the high word
+ MOVQ(r0, t2)
+ IMULQ(h2, t2) // t2 = r0 * h2; single-word product (h2 is presumably only a few bits wide — confirm)
+ ADDQ(RDX, t2) // t2 += high word of r0*h1
+
+ MOVQ(r1, RAX)
+ MULQ(h0) // RDX:RAX = r1 * h0
+ ADDQ(RAX, t1) // t1 += low word of r1*h0
+ ADCQ(Imm(0), RDX) // fold the carry into the high word
+ MOVQ(RDX, h0) // stash the high word of r1*h0 in h0; the old h0 is no longer needed
+ MOVQ(r1, t3)
+ IMULQ(h2, t3) // t3 = r1 * h2
+ MOVQ(r1, RAX)
+ MULQ(h1) // RDX:RAX = r1 * h1
+ ADDQ(RAX, t2) // t2 += low word of r1*h1
+ ADCQ(RDX, t3) // t3 += high word of r1*h1 plus carry
+ ADDQ(h0, t2) // t2 += stashed high word of r1*h0
+ ADCQ(Imm(0), t3) // propagate the carry; the product is now t3:t2:t1:t0
+
+ MOVQ(t0, h0) // h0 = product word 0
+ MOVQ(t1, h1) // h1 = product word 1
+ MOVQ(t2, h2)
+ ANDQ(Imm(3), h2) // h2 = low 2 bits of word 2, the part kept as-is
+ MOVQ(t2, t0)
+ ANDQ(I32(-4), t0) // t0 = word 2 with the low 2 bits cleared (mask 0xFFFFFFFFFFFFFFFC in the reference)
+ ADDQ(t0, h0) // add the overflow c as t3:t0, i.e. with its low two bits zero (4*c)
+ ADCQ(t3, h1)
+ ADCQ(Imm(0), h2)
+ SHRQ(Imm(2), t3, t2) // t2 = t2>>2 with the low bits of t3 shifted in (double-width shift)
+ SHRQ(Imm(2), t3) // t3 >>= 2, so t3:t2 is now c itself
+ ADDQ(t2, h0) // add c once more: 5*c in total, consistent with reduction modulo 2^130 - 5
+ ADCQ(t3, h1)
+ ADCQ(Imm(0), h2)
+}
diff --git a/internal/poly1305/sum_amd64.s b/internal/poly1305/sum_amd64.s
index e0d3c64..1337573 100644
--- a/internal/poly1305/sum_amd64.s
+++ b/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
//go:build gc && !purego
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
- ADDQ 0(msg), h0; \
- ADCQ 8(msg), h1; \
- ADCQ $1, h2; \
- LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
- MOVQ r0, AX; \
- MULQ h0; \
- MOVQ AX, t0; \
- MOVQ DX, t1; \
- MOVQ r0, AX; \
- MULQ h1; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ r0, t2; \
- IMULQ h2, t2; \
- ADDQ DX, t2; \
- \
- MOVQ r1, AX; \
- MULQ h0; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ DX, h0; \
- MOVQ r1, t3; \
- IMULQ h2, t3; \
- MOVQ r1, AX; \
- MULQ h1; \
- ADDQ AX, t2; \
- ADCQ DX, t3; \
- ADDQ h0, t2; \
- ADCQ $0, t3; \
- \
- MOVQ t0, h0; \
- MOVQ t1, h1; \
- MOVQ t2, h2; \
- ANDQ $3, h2; \
- MOVQ t2, t0; \
- ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
- ADDQ t0, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2; \
- SHRQ $2, t3, t2; \
- SHRQ $2, t3; \
- ADDQ t2, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15
-
- MOVQ 0(DI), R8 // h0
- MOVQ 8(DI), R9 // h1
- MOVQ 16(DI), R10 // h2
- MOVQ 24(DI), R11 // r0
- MOVQ 32(DI), R12 // r1
-
- CMPQ R15, $16
+ MOVQ (DI), R8
+ MOVQ 8(DI), R9
+ MOVQ 16(DI), R10
+ MOVQ 24(DI), R11
+ MOVQ 32(DI), R12
+ CMPQ R15, $0x10
JB bytes_between_0_and_15
loop:
- POLY1305_ADD(SI, R8, R9, R10)
+ ADDQ (SI), R8
+ ADCQ 8(SI), R9
+ ADCQ $0x01, R10
+ LEAQ 16(SI), SI
multiply:
- POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
- SUBQ $16, R15
- CMPQ R15, $16
- JAE loop
+ MOVQ R11, AX
+ MULQ R8
+ MOVQ AX, BX
+ MOVQ DX, CX
+ MOVQ R11, AX
+ MULQ R9
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ R11, R13
+ IMULQ R10, R13
+ ADDQ DX, R13
+ MOVQ R12, AX
+ MULQ R8
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+ MOVQ R12, R14
+ IMULQ R10, R14
+ MOVQ R12, AX
+ MULQ R9
+ ADDQ AX, R13
+ ADCQ DX, R14
+ ADDQ R8, R13
+ ADCQ $0x00, R14
+ MOVQ BX, R8
+ MOVQ CX, R9
+ MOVQ R13, R10
+ ANDQ $0x03, R10
+ MOVQ R13, BX
+ ANDQ $-4, BX
+ ADDQ BX, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SHRQ $0x02, R14, R13
+ SHRQ $0x02, R14
+ ADDQ R13, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SUBQ $0x10, R15
+ CMPQ R15, $0x10
+ JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
- MOVQ $1, BX
+ MOVQ $0x00000001, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
- SHLQ $8, BX, CX
- SHLQ $8, BX
+ SHLQ $0x08, BX, CX
+ SHLQ $0x08, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
-
ADDQ BX, R8
ADCQ CX, R9
- ADCQ $0, R10
- MOVQ $16, R15
+ ADCQ $0x00, R10
+ MOVQ $0x00000010, R15
JMP multiply
done:
- MOVQ R8, 0(DI)
+ MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET