aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorPaul E. Murphy <murp@ibm.com>2022-03-24 10:09:13 -0500
committerPaul Murphy <murp@ibm.com>2022-04-07 20:56:02 +0000
commit8d581f589eb9b9eac05c75f0dfe82a49c3afcd2f (patch)
treef7436b36f32eaa23bdf26b3e70f61c833dc95e1e /src
parent6f6942ef7afc34c6b7e1eea3031ed61acc458d2a (diff)
downloadgo-8d581f589eb9b9eac05c75f0dfe82a49c3afcd2f.tar.xz
crypto/aes: simplify key expansion in ppc64le asm
The ported cryptogam implementation uses a subtle and tricky mechanism using lxv/vperm/lvsl to load unaligned vectors. This is difficult to read, and may read and write unrelated bytes if reading from an unaligned address. Instead, POWER8 instructions can be used to load from unaligned memory with much less overhead. Alignment interrupts only occur when reading or writing cache-inhibited memory, which we assume isn't used in go today, otherwise alignment penalties are usually marginal. Instead lxvd2x+xxpermdi and xxpermdi+stxvd2x can be used to emulate unaligned LE bytewise loads, similar to lxv/stxv on POWER9 in little-endian mode. Likewise, a custom permute vector is used to emulate BE bytewise storage operations, lxvb16x/stxvb16x, on POWER9. This greatly simplifies the code, and it makes it much easier to store the keys in reverse (which is exactly how the decrypt keys are expected to be stored). Change-Id: I2334337e31a8fdf8d13ba96231142a039f237098 Reviewed-on: https://go-review.googlesource.com/c/go/+/395494 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Trust: Paul Murphy <murp@ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org>
Diffstat (limited to 'src')
-rw-r--r--src/crypto/aes/asm_ppc64le.s132
1 files changed, 54 insertions, 78 deletions
diff --git a/src/crypto/aes/asm_ppc64le.s b/src/crypto/aes/asm_ppc64le.s
index 5eae675322..192a8096cd 100644
--- a/src/crypto/aes/asm_ppc64le.s
+++ b/src/crypto/aes/asm_ppc64le.s
@@ -43,6 +43,10 @@
#define OUTHEAD V10
#define OUTTAIL V11
+// For P9 instruction emulation
+#define ESPERM V21 // Endian swapping permute into BE
+#define TMP2 V22 // Temporary for P8_STXVB16X/P8_STXV
+
// For {en,de}cryptBlockAsm
#define BLK_INP R3
#define BLK_OUT R4
@@ -50,15 +54,38 @@
#define BLK_ROUNDS R6
#define BLK_IDX R7
-DATA ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x10(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x18(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x30(SB)/8, $0x0000000000000000
-DATA ·rcon+0x38(SB)/8, $0x0000000000000000
-GLOBL ·rcon(SB), RODATA, $64
+DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
+DATA ·rcon+0x08(SB)/8, $0x0706050403020100
+DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x40(SB)/8, $0x0000000000000000
+DATA ·rcon+0x48(SB)/8, $0x0000000000000000
+GLOBL ·rcon(SB), RODATA, $80
+
+// Emulate unaligned BE vector load/stores on LE targets
+#define P8_LXVB16X(RA,RB,VT) \
+ LXVD2X (RA+RB), VT \
+ VPERM VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+ VPERM VS, VS, ESPERM, TMP2 \
+ STXVD2X TMP2, (RA+RB)
+
+#define P8_STXV(VS,RA,RB) \
+ XXPERMDI VS, VS, $2, TMP2 \
+ STXVD2X TMP2, (RA+RB)
+
+#define P8_LXV(RA,RB,VT) \
+ LXVD2X (RA+RB), VT \
+ XXPERMDI VT, VT, $2, VT
+
+#define LXSDX_BE(RA,RB,VT) \
+ LXSDX (RA+RB), VT \
+ VPERM VT, VT, ESPERM, VT
// func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
@@ -87,45 +114,32 @@ TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
BC 0x06, 2, enc_key_abort // bne- .Lenc_key_abort
MOVD $·rcon(SB), PTR // PTR point to rcon addr
+ LVX (PTR), ESPERM
+ ADD $0x10, PTR
// Get key from memory and write aligned into VR
- NEG INP, R9 // neg 9,3 R9 is ~INP + 1
- LVX (INP)(R0), IN0 // lvx 1,0,3 Load key inside IN0
- ADD $15, INP, INP // addi 3,3,15 Add 15B to INP addr
- LVSR (R9)(R0), KEY // lvsr 3,0,9
+ P8_LXVB16X(INP, R0, IN0)
+ ADD $0x10, INP, INP
MOVD $0x20, R8 // li 8,0x20 R8 = 32
+
CMPW BITS, $192 // cmpwi 4,192 Key size == 192?
- LVX (INP)(R0), IN1 // lvx 2,0,3
- VSPLTISB $0x0f, MASK// vspltisb 5,0x0f 0x0f0f0f0f... mask
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
- VXOR KEY, MASK, KEY // vxor 3,3,5 Adjust for byte swap
LVX (PTR)(R8), MASK // lvx 5,8,6
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
- VPERM IN0, IN1, KEY, IN0 // vperm 1,1,2,3 Align
MOVD $8, CNT // li 7,8 CNT = 8
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
- LVSL (OUT)(R0), OUTPERM // lvsl 8,0,5
- VSPLTISB $-1, OUTMASK // vspltisb 9,-1
- LVX (OUT)(R0), OUTHEAD // lvx 10,0,5
- VPERM OUTMASK, ZERO, OUTPERM, OUTMASK // vperm 9,9,0,8
-
BLT loop128 // blt .Loop128
- ADD $8, INP, INP // addi 3,3,8
BEQ l192 // beq .L192
- ADD $8, INP, INP // addi 3,3,8
JMP l256 // b .L256
loop128:
// Key schedule (Round 1 to 8)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Write to output
ADD $16, OUT, OUT // addi 5,5,16 Point to the next round
VXOR IN0, TMP, IN0 // vxor 1,1,6
@@ -142,11 +156,8 @@ loop128:
// Key schedule (Round 9)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-spat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 9
ADD $16, OUT, OUT // addi 5,5,16
// Key schedule (Round 10)
@@ -160,11 +171,8 @@ loop128:
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 10
ADD $16, OUT, OUT // addi 5,5,16
// Key schedule (Round 11)
@@ -174,26 +182,18 @@ loop128:
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 11
+ P8_STXV(IN0, R0, OUT)
- ADD $15, OUT, INP // addi 3,5,15
ADD $0x50, OUT, OUT // addi 5,5,0x50
MOVD $10, ROUNDS // li 8,10
JMP done // b .Ldone
l192:
- LVX (INP)(R0), TMP // lvx 6,0,3
+ LXSDX_BE(INP, R0, IN1) // Load next 8 bytes into upper half of VSR in BE order.
MOVD $4, CNT // li 7,4
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
- VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
VSPLTISB $8, KEY // vspltisb 3,8
MOVD CNT, CTR // mtctr 7
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3
@@ -221,23 +221,17 @@ loop192:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(STAGE, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
- VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(STAGE, R0, OUT)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VSPLTW $3, IN0, TMP // vspltw 6,1,3
@@ -247,11 +241,7 @@ loop192:
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
- ADD $15, OUT, INP // addi 3,5,15
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
BC 0x10, 0, loop192 // bdnz .Loop192
@@ -260,25 +250,18 @@ loop192:
BR done // b .Ldone
l256:
- LVX (INP)(R0), TMP // lvx 6,0,3
+ P8_LXVB16X(INP, R0, IN1)
MOVD $7, CNT // li 7,7
MOVD $14, ROUNDS // li 8,14
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
- VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
MOVD CNT, CTR // mtctr 7
loop256:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN1, IN1, OUTPERM, OUTTAIL // vperm 11,2,2,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN1, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VXOR IN0, TMP, IN0 // vxor 1,1,6
@@ -288,11 +271,7 @@ loop256:
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
- ADD $15, OUT, INP // addi 3,5,15
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
BC 0x12, 0, done // bdz .Ldone
@@ -310,9 +289,6 @@ loop256:
JMP loop256 // b .Loop256
done:
- LVX (INP)(R0), IN1 // lvx 2,0,3
- VSEL OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
- STVX IN1, (INP+R0) // stvx 2,0,3
MOVD $0, PTR // li 6,0 set PTR to 0 (exit code 0)
MOVW ROUNDS, 0(OUT) // stw 8,0(5)