aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authorMeng Zhuo <mengzhuo1203@gmail.com>2018-04-04 13:15:22 +0800
committerBrad Fitzpatrick <bradfitz@golang.org>2018-04-04 16:25:39 +0000
commit73e0c302b2bcf26c009f682c2c11ef3567854d46 (patch)
tree8769fd8e6fa921267ea7befd324ed8c676b25311 /src/runtime
parent28c1ad9d35f27b3b57afff4ee78faac746a8ed0a (diff)
downloadgo-73e0c302b2bcf26c009f682c2c11ef3567854d46.tar.xz
runtime: implement aeshash for arm64 platform
Fix #10109 name old time/op new time/op delta Hash5 72.3ns ± 0% 51.5ns ± 0% -28.71% (p=0.000 n=4+5) Hash16 78.8ns ± 0% 48.7ns ± 0% ~ (p=0.079 n=4+5) Hash64 196ns ±25% 73ns ±16% -62.68% (p=0.008 n=5+5) Hash1024 1.57µs ± 0% 0.27µs ± 1% -82.90% (p=0.000 n=5+4) Hash65536 96.5µs ± 0% 14.3µs ± 0% -85.14% (p=0.016 n=5+4) HashStringSpeed 156ns ± 6% 129ns ± 3% -17.56% (p=0.008 n=5+5) HashBytesSpeed 227ns ± 1% 200ns ± 1% -11.98% (p=0.008 n=5+5) HashInt32Speed 116ns ± 2% 102ns ± 0% -11.92% (p=0.016 n=5+4) HashInt64Speed 120ns ± 3% 101ns ± 2% -15.55% (p=0.008 n=5+5) HashStringArraySpeed 342ns ± 0% 306ns ± 2% -10.58% (p=0.008 n=5+5) FastrandHashiter 217ns ± 1% 217ns ± 1% ~ (p=1.000 n=5+5) name old speed new speed delta Hash5 69.1MB/s ± 0% 97.0MB/s ± 0% +40.32% (p=0.008 n=5+5) Hash16 203MB/s ± 0% 329MB/s ± 0% +61.76% (p=0.016 n=4+5) Hash64 332MB/s ±21% 881MB/s ±14% +165.66% (p=0.008 n=5+5) Hash1024 651MB/s ± 0% 3652MB/s ±17% +460.73% (p=0.008 n=5+5) Hash65536 679MB/s ± 0% 4570MB/s ± 0% +572.85% (p=0.016 n=5+4) Change-Id: I573028979f84cf2e0e087951271d5de8865dbf04 Reviewed-on: https://go-review.googlesource.com/89755 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/alg.go24
-rw-r--r--src/runtime/asm_arm64.s393
-rw-r--r--src/runtime/hash64.go3
-rw-r--r--src/runtime/os_linux_arm64.go4
-rw-r--r--src/runtime/runtime2.go2
5 files changed, 402 insertions, 24 deletions
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index 89125f48ba..cc723e49e2 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -272,25 +272,24 @@ func ifaceHash(i interface {
const hashRandomBytes = sys.PtrSize / 4 * 64
-// used in asm_{386,amd64}.s to seed the hash function
+// used in asm_{386,amd64,arm64}.s to seed the hash function
var aeskeysched [hashRandomBytes]byte
// used in hash{32,64}.go to seed the hash function
var hashkey [4]uintptr
func alginit() {
- // Install aes hash algorithm if we have the instructions we need
+ // Install AES hash algorithms if the instructions needed are present.
if (GOARCH == "386" || GOARCH == "amd64") &&
GOOS != "nacl" &&
support_aes && // AESENC
support_ssse3 && // PSHUFB
support_sse41 { // PINSR{D,Q}
- useAeshash = true
- algarray[alg_MEM32].hash = aeshash32
- algarray[alg_MEM64].hash = aeshash64
- algarray[alg_STRING].hash = aeshashstr
- // Initialize with random data so hash collisions will be hard to engineer.
- getRandomData(aeskeysched[:])
+ initAlgAES()
+ return
+ }
+ if GOARCH == "arm64" && arm64_support_aes {
+ initAlgAES()
return
}
getRandomData((*[len(hashkey) * sys.PtrSize]byte)(unsafe.Pointer(&hashkey))[:])
@@ -299,3 +298,12 @@ func alginit() {
hashkey[2] |= 1
hashkey[3] |= 1
}
+
+func initAlgAES() {
+ useAeshash = true
+ algarray[alg_MEM32].hash = aeshash32
+ algarray[alg_MEM64].hash = aeshash64
+ algarray[alg_STRING].hash = aeshashstr
+ // Initialize with random data so hash collisions will be hard to engineer.
+ getRandomData(aeskeysched[:])
+}
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 2b39d2ec72..1e0d71ab3b 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -436,20 +436,385 @@ CALLFN(·call268435456, 268435464 )
CALLFN(·call536870912, 536870920 )
CALLFN(·call1073741824, 1073741832 )
-// AES hashing not implemented for ARM64, issue #10109.
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
- MOVW $0, R0
- MOVW (R0), R1
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
- MOVW $0, R0
- MOVW (R0), R1
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
- MOVW $0, R0
- MOVW (R0), R1
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
- MOVW $0, R0
- MOVW (R0), R1
-
+// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-24
+ MOVD p+0(FP), R0
+ MOVD h+8(FP), R1
+ MOVD $ret+16(FP), R2
+ MOVD $runtime·aeskeysched+0(SB), R3
+
+ VEOR V0.B16, V0.B16, V0.B16
+ VLD1 (R3), [V2.B16]
+ VLD1 (R0), V0.S[1]
+ VMOV R1, V0.S[0]
+
+ AESE V2.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V2.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V2.B16, V0.B16
+
+ VST1 [V0.D1], (R2)
+ RET
+
+// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-24
+ MOVD p+0(FP), R0
+ MOVD h+8(FP), R1
+ MOVD $ret+16(FP), R2
+ MOVD $runtime·aeskeysched+0(SB), R3
+
+ VEOR V0.B16, V0.B16, V0.B16
+ VLD1 (R3), [V2.B16]
+ VLD1 (R0), V0.D[1]
+ VMOV R1, V0.D[0]
+
+ AESE V2.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V2.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V2.B16, V0.B16
+
+ VST1 [V0.D1], (R2)
+ RET
+
+// func aeshash(p unsafe.Pointer, h, size uintptr) uintptr
+TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-32
+ MOVD p+0(FP), R0
+ MOVD s+16(FP), R1
+ MOVWU h+8(FP), R3
+ MOVD $ret+24(FP), R2
+ B aeshashbody<>(SB)
+
+// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-24
+ MOVD p+0(FP), R10 // string pointer
+ LDP (R10), (R0, R1) //string data/ length
+ MOVWU h+8(FP), R3
+ MOVD $ret+16(FP), R2 // return adddress
+ B aeshashbody<>(SB)
+
+// R0: data
+// R1: length (maximum 32 bits)
+// R2: address to put return value
+// R3: seed data
+TEXT aeshashbody<>(SB),NOSPLIT|NOFRAME,$0
+ VEOR V30.B16, V30.B16, V30.B16
+ VMOV R3, V30.S[0]
+ VMOV R1, V30.S[1] // load length into seed
+
+ MOVD $runtime·aeskeysched+0(SB), R4
+ VLD1.P 16(R4), [V0.B16]
+ AESE V30.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ CMP $16, R1
+ BLO aes0to15
+ BEQ aes16
+ CMP $32, R1
+ BLS aes17to32
+ CMP $64, R1
+ BLS aes33to64
+ CMP $128, R1
+ BLS aes65to128
+ B aes129plus
+
+aes0to15:
+ CMP $0, R1
+ BEQ aes0
+ VEOR V2.B16, V2.B16, V2.B16
+ TBZ $3, R1, less_than_8
+ VLD1.P 8(R0), V2.D[0]
+
+less_than_8:
+ TBZ $2, R1, less_than_4
+ VLD1.P 4(R0), V2.S[2]
+
+less_than_4:
+ TBZ $1, R1, less_than_2
+ VLD1.P 2(R0), V2.H[6]
+
+less_than_2:
+ TBZ $0, R1, done
+ VLD1 (R0), V2.B[14]
+done:
+ AESE V0.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V0.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V0.B16, V2.B16
+
+ VST1 [V2.D1], (R2)
+ RET
+aes0:
+ VST1 [V0.D1], (R2)
+ RET
+aes16:
+ VLD1 (R0), [V2.B16]
+ B done
+
+aes17to32:
+ // make second seed
+ VLD1 (R4), [V1.B16]
+ AESE V30.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ SUB $16, R1, R10
+ VLD1.P (R0)(R10), [V2.B16]
+ VLD1 (R0), [V3.B16]
+
+ AESE V0.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V1.B16, V3.B16
+ AESMC V3.B16, V3.B16
+
+ AESE V0.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V1.B16, V3.B16
+ AESMC V3.B16, V3.B16
+
+ AESE V0.B16, V2.B16
+ AESE V1.B16, V3.B16
+
+ VEOR V3.B16, V2.B16, V2.B16
+ VST1 [V2.D1], (R2)
+ RET
+
+aes33to64:
+ VLD1 (R4), [V1.B16, V2.B16, V3.B16]
+ AESE V30.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V30.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V30.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ SUB $32, R1, R10
+
+ VLD1.P (R0)(R10), [V4.B16, V5.B16]
+ VLD1 (R0), [V6.B16, V7.B16]
+
+ AESE V0.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V1.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V2.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V3.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ AESE V0.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V1.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V2.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V3.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ AESE V0.B16, V4.B16
+ AESE V1.B16, V5.B16
+ AESE V2.B16, V6.B16
+ AESE V3.B16, V7.B16
+
+ VEOR V6.B16, V4.B16, V4.B16
+ VEOR V7.B16, V5.B16, V5.B16
+ VEOR V5.B16, V4.B16, V4.B16
+
+ VST1 [V4.D1], (R2)
+ RET
+
+aes65to128:
+ VLD1.P 64(R4), [V1.B16, V2.B16, V3.B16, V4.B16]
+ VLD1 (R4), [V5.B16, V6.B16, V7.B16]
+ AESE V30.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V30.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V30.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ AESE V30.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V30.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V30.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V30.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ SUB $64, R1, R10
+ VLD1.P (R0)(R10), [V8.B16, V9.B16, V10.B16, V11.B16]
+ VLD1 (R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+ AESE V0.B16, V8.B16
+ AESMC V8.B16, V8.B16
+ AESE V1.B16, V9.B16
+ AESMC V9.B16, V9.B16
+ AESE V2.B16, V10.B16
+ AESMC V10.B16, V10.B16
+ AESE V3.B16, V11.B16
+ AESMC V11.B16, V11.B16
+ AESE V4.B16, V12.B16
+ AESMC V12.B16, V12.B16
+ AESE V5.B16, V13.B16
+ AESMC V13.B16, V13.B16
+ AESE V6.B16, V14.B16
+ AESMC V14.B16, V14.B16
+ AESE V7.B16, V15.B16
+ AESMC V15.B16, V15.B16
+
+ AESE V0.B16, V8.B16
+ AESMC V8.B16, V8.B16
+ AESE V1.B16, V9.B16
+ AESMC V9.B16, V9.B16
+ AESE V2.B16, V10.B16
+ AESMC V10.B16, V10.B16
+ AESE V3.B16, V11.B16
+ AESMC V11.B16, V11.B16
+ AESE V4.B16, V12.B16
+ AESMC V12.B16, V12.B16
+ AESE V5.B16, V13.B16
+ AESMC V13.B16, V13.B16
+ AESE V6.B16, V14.B16
+ AESMC V14.B16, V14.B16
+ AESE V7.B16, V15.B16
+ AESMC V15.B16, V15.B16
+
+ AESE V0.B16, V8.B16
+ AESE V1.B16, V9.B16
+ AESE V2.B16, V10.B16
+ AESE V3.B16, V11.B16
+ AESE V4.B16, V12.B16
+ AESE V5.B16, V13.B16
+ AESE V6.B16, V14.B16
+ AESE V7.B16, V15.B16
+
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V10.B16, V10.B16
+ VEOR V15.B16, V11.B16, V11.B16
+ VEOR V10.B16, V8.B16, V8.B16
+ VEOR V11.B16, V9.B16, V9.B16
+ VEOR V9.B16, V8.B16, V8.B16
+
+ VST1 [V8.D1], (R2)
+ RET
+
+aes129plus:
+ PRFM (R0), PLDL1KEEP
+ VLD1.P 64(R4), [V1.B16, V2.B16, V3.B16, V4.B16]
+ VLD1 (R4), [V5.B16, V6.B16, V7.B16]
+ AESE V30.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V30.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V30.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ AESE V30.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V30.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V30.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V30.B16, V7.B16
+ AESMC V7.B16, V7.B16
+ ADD R0, R1, R10
+ SUB $128, R10, R10
+ VLD1.P 64(R10), [V8.B16, V9.B16, V10.B16, V11.B16]
+ VLD1 (R10), [V12.B16, V13.B16, V14.B16, V15.B16]
+ SUB $1, R1, R1
+ LSR $7, R1, R1
+
+aesloop:
+ AESE V8.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V9.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V10.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V11.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ AESE V12.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V13.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V14.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V15.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ VLD1.P 64(R0), [V8.B16, V9.B16, V10.B16, V11.B16]
+ AESE V8.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V9.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V10.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V11.B16, V3.B16
+ AESMC V3.B16, V3.B16
+
+ VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+ AESE V12.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V13.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V14.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V15.B16, V7.B16
+ AESMC V7.B16, V7.B16
+ SUB $1, R1, R1
+ CBNZ R1, aesloop
+
+ AESE V8.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V9.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V10.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V11.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ AESE V12.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V13.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V14.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V15.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ AESE V8.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V9.B16, V1.B16
+ AESMC V1.B16, V1.B16
+ AESE V10.B16, V2.B16
+ AESMC V2.B16, V2.B16
+ AESE V11.B16, V3.B16
+ AESMC V3.B16, V3.B16
+ AESE V12.B16, V4.B16
+ AESMC V4.B16, V4.B16
+ AESE V13.B16, V5.B16
+ AESMC V5.B16, V5.B16
+ AESE V14.B16, V6.B16
+ AESMC V6.B16, V6.B16
+ AESE V15.B16, V7.B16
+ AESMC V7.B16, V7.B16
+
+ AESE V8.B16, V0.B16
+ AESE V9.B16, V1.B16
+ AESE V10.B16, V2.B16
+ AESE V11.B16, V3.B16
+ AESE V12.B16, V4.B16
+ AESE V13.B16, V5.B16
+ AESE V14.B16, V6.B16
+ AESE V15.B16, V7.B16
+
+ VEOR V0.B16, V1.B16, V0.B16
+ VEOR V2.B16, V3.B16, V2.B16
+ VEOR V4.B16, V5.B16, V4.B16
+ VEOR V6.B16, V7.B16, V6.B16
+ VEOR V0.B16, V2.B16, V0.B16
+ VEOR V4.B16, V6.B16, V4.B16
+ VEOR V4.B16, V0.B16, V0.B16
+
+ VST1 [V0.D1], (R2)
+ RET
+
TEXT runtime·procyield(SB),NOSPLIT,$0-0
MOVWU cycles+0(FP), R0
again:
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index 3cf3f4629b..54098d9d2a 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -21,7 +21,8 @@ const (
)
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
- if GOARCH == "amd64" && GOOS != "nacl" && useAeshash {
+ if (GOARCH == "amd64" || GOARCH == "arm64") &&
+ GOOS != "nacl" && useAeshash {
return aeshash(p, seed, s)
}
h := uint64(seed + s*hashkey[0])
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index 96827e7c9f..9342a042ac 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -15,8 +15,9 @@ var randomNumber uint32
// HWCAP/HWCAP2 bits for hardware capabilities.
//go:linkname cpu_hwcap internal/cpu.arm64_hwcap
-//go:linkname cpu_hwcap2 internal/cpu.arm64_hwcap2
var cpu_hwcap uint
+
+//go:linkname cpu_hwcap2 internal/cpu.arm64_hwcap2
var cpu_hwcap2 uint
func archauxv(tag, val uintptr) {
@@ -28,6 +29,7 @@ func archauxv(tag, val uintptr) {
randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
case _AT_HWCAP:
+ arm64_support_aes = ((val>>3)&0x1 == 0x1)
cpu_hwcap = uint(val)
case _AT_HWCAP2:
cpu_hwcap2 = uint(val)
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 059e14e002..72a80a6907 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -784,6 +784,8 @@ var (
support_sse42 bool
support_ssse3 bool
+ arm64_support_aes bool
+
goarm uint8 // set by cmd/link on arm systems
framepointer_enabled bool // set by cmd/link
)