diff options
| author | Keith Randall <khr@golang.org> | 2013-04-02 16:26:15 -0700 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2013-04-02 16:26:15 -0700 |
| commit | 3d5daa23198f4b7ee71dd7647d5d061e1c883fce (patch) | |
| tree | 6aa7fe53a84cc324261013cb5e912bbc33521e98 /src/pkg/runtime | |
| parent | 6ca1fa625c2377071163399f1579a440e7d29502 (diff) | |
| download | go-3d5daa23198f4b7ee71dd7647d5d061e1c883fce.tar.xz | |
runtime: Implement faster equals for strings and bytes.
(amd64)
benchmark old ns/op new ns/op delta
BenchmarkEqual0 16 6 -63.15%
BenchmarkEqual9 22 7 -65.37%
BenchmarkEqual32 36 9 -74.91%
BenchmarkEqual4K 2187 120 -94.51%
benchmark old MB/s new MB/s speedup
BenchmarkEqual9 392.22 1134.38 2.89x
BenchmarkEqual32 866.72 3457.39 3.99x
BenchmarkEqual4K 1872.73 33998.87 18.15x
(386)
benchmark old ns/op new ns/op delta
BenchmarkEqual0 16 5 -63.85%
BenchmarkEqual9 22 7 -67.84%
BenchmarkEqual32 34 12 -64.94%
BenchmarkEqual4K 2196 113 -94.85%
benchmark old MB/s new MB/s speedup
BenchmarkEqual9 405.81 1260.18 3.11x
BenchmarkEqual32 919.55 2631.21 2.86x
BenchmarkEqual4K 1864.85 36072.54 19.34x
Update #3751
R=bradfitz, r, khr, dave, remyoudompheng, fullung, minux.ma, ality
CC=golang-dev
https://golang.org/cl/8056043
Diffstat (limited to 'src/pkg/runtime')
| -rw-r--r-- | src/pkg/runtime/alg.c | 23 | ||||
| -rw-r--r-- | src/pkg/runtime/asm_386.s | 115 | ||||
| -rw-r--r-- | src/pkg/runtime/asm_amd64.s | 112 | ||||
| -rw-r--r-- | src/pkg/runtime/asm_arm.s | 19 | ||||
| -rw-r--r-- | src/pkg/runtime/hashmap.c | 2 | ||||
| -rw-r--r-- | src/pkg/runtime/mapspeed_test.go | 11 | ||||
| -rw-r--r-- | src/pkg/runtime/runtime.h | 2 | ||||
| -rw-r--r-- | src/pkg/runtime/string.goc | 10 | ||||
| -rw-r--r-- | src/pkg/runtime/string_test.go | 28 |
9 files changed, 294 insertions, 28 deletions
diff --git a/src/pkg/runtime/alg.c b/src/pkg/runtime/alg.c index 2dc8212566..a78d9780c7 100644 --- a/src/pkg/runtime/alg.c +++ b/src/pkg/runtime/alg.c @@ -37,25 +37,11 @@ runtime·memhash(uintptr *h, uintptr s, void *a) void runtime·memequal(bool *eq, uintptr s, void *a, void *b) { - byte *ba, *bb, *aend; - if(a == b) { *eq = 1; return; } - ba = a; - bb = b; - aend = ba+s; - while(ba != aend) { - if(*ba != *bb) { - *eq = 0; - return; - } - ba++; - bb++; - } - *eq = 1; - return; + *eq = runtime·memeq(a, b, s); } void @@ -323,6 +309,7 @@ void runtime·strequal(bool *eq, uintptr s, void *a, void *b) { intgo alen; + byte *s1, *s2; USED(s); alen = ((String*)a)->len; @@ -330,11 +317,13 @@ runtime·strequal(bool *eq, uintptr s, void *a, void *b) *eq = false; return; } - if(((String*)a)->str == ((String*)b)->str) { + s1 = ((String*)a)->str; + s2 = ((String*)b)->str; + if(s1 == s2) { *eq = true; return; } - runtime·memequal(eq, alen, ((String*)a)->str, ((String*)b)->str); + *eq = runtime·memeq(s1, s2, alen); } void diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s index 57de87b8d4..531057ff8a 100644 --- a/src/pkg/runtime/asm_386.s +++ b/src/pkg/runtime/asm_386.s @@ -986,3 +986,118 @@ TEXT shifts(SB),7,$0 LONG $0x0c0b0a09 LONG $0xff0f0e0d +TEXT runtime·memeq(SB),7,$0 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + MOVL count+8(FP), BX + JMP runtime·memeqbody(SB) + + +TEXT bytes·Equal(SB),7,$0 + MOVL a_len+4(FP), BX + MOVL b_len+16(FP), CX + XORL AX, AX + CMPL BX, CX + JNE eqret + MOVL a+0(FP), SI + MOVL b+12(FP), DI + CALL runtime·memeqbody(SB) +eqret: + MOVB AX, ret+24(FP) + RET + +// a in SI +// b in DI +// count in BX +TEXT runtime·memeqbody(SB),7,$0 + XORL AX, AX + + CMPL BX, $4 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPL BX, $64 + JB bigloop + TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 + JE bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDL $64, SI + ADDL $64, DI + SUBL $64, BX + CMPL DX, $0xffff + JEQ hugeloop + RET + + // 4 bytes at a time using 32-bit register +bigloop: + CMPL BX, $4 + JBE leftover + MOVL (SI), CX + MOVL (DI), DX + ADDL $4, SI + ADDL $4, DI + SUBL $4, BX + CMPL CX, DX + JEQ bigloop + RET + + // remaining 0-4 bytes +leftover: + MOVL -4(SI)(BX*1), CX + MOVL -4(DI)(BX*1), DX + CMPL CX, DX + SETEQ AX + RET + +small: + CMPL BX, $0 + JEQ equal + + LEAL 0(BX*8), CX + NEGL CX + + MOVL SI, DX + CMPB DX, $0xfc + JA si_high + + // load at SI won't cross a page boundary. + MOVL (SI), SI + JMP si_finish +si_high: + // address ends in 111111xx. Load up to bytes we want, move to correct position. + MOVL -4(SI)(BX*1), SI + SHRL CX, SI +si_finish: + + // same for DI. + MOVL DI, DX + CMPB DX, $0xfc + JA di_high + MOVL (DI), DI + JMP di_finish +di_high: + MOVL -4(DI)(BX*1), DI + SHRL CX, DI +di_finish: + + SUBL SI, DI + SHLL CX, DI +equal: + SETEQ AX + RET diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s index af2064ff3a..0dee1556da 100644 --- a/src/pkg/runtime/asm_amd64.s +++ b/src/pkg/runtime/asm_amd64.s @@ -907,3 +907,115 @@ TEXT shifts(SB),7,$0 QUAD $0xffff0f0e0d0c0b0a QUAD $0x0807060504030201 QUAD $0xff0f0e0d0c0b0a09 + +TEXT runtime·memeq(SB),7,$0 + MOVQ a+0(FP), SI + MOVQ b+8(FP), DI + MOVQ count+16(FP), BX + JMP runtime·memeqbody(SB) + + +TEXT bytes·Equal(SB),7,$0 + MOVQ a_len+8(FP), BX + MOVQ b_len+32(FP), CX + XORQ AX, AX + CMPQ BX, CX + JNE eqret + MOVQ a+0(FP), SI + MOVQ b+24(FP), DI + CALL runtime·memeqbody(SB) +eqret: + MOVB AX, ret+48(FP) + RET + +// a in SI +// b in DI +// count in BX +TEXT runtime·memeqbody(SB),7,$0 + XORQ AX, AX + + CMPQ BX, $8 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPQ BX, $64 + JB bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffff + JEQ hugeloop + RET + + // 8 bytes at a time using 64-bit register +bigloop: + CMPQ BX, $8 + JBE leftover + MOVQ (SI), CX + MOVQ (DI), DX + ADDQ $8, SI + ADDQ $8, DI + SUBQ $8, BX + CMPQ CX, DX + JEQ bigloop + RET + + // remaining 0-8 bytes +leftover: + MOVQ -8(SI)(BX*1), CX + MOVQ -8(DI)(BX*1), DX + CMPQ CX, DX + SETEQ AX + RET + +small: + CMPQ BX, $0 + JEQ equal + + LEAQ 0(BX*8), CX + NEGQ CX + + CMPB SI, $0xf8 + JA si_high + + // load at SI won't cross a page boundary. + MOVQ (SI), SI + JMP si_finish +si_high: + // address ends in 11111xxx. Load up to bytes we want, move to correct position. + MOVQ -8(SI)(BX*1), SI + SHRQ CX, SI +si_finish: + + // same for DI. + CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ -8(DI)(BX*1), DI + SHRQ CX, DI +di_finish: + + SUBQ SI, DI + SHLQ CX, DI +equal: + SETEQ AX + RET diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s index e544933326..ee9acb749c 100644 --- a/src/pkg/runtime/asm_arm.s +++ b/src/pkg/runtime/asm_arm.s @@ -487,7 +487,7 @@ TEXT runtime·stackguard(SB),7,$0 MOVW R2, limit+4(FP) RET -// not implemented for ARM +// AES hashing not implemented for ARM TEXT runtime·aeshash(SB),7,$-4 MOVW $0, R0 MOVW (R0), R1 @@ -500,3 +500,20 @@ TEXT runtime·aeshash64(SB),7,$-4 TEXT runtime·aeshashstr(SB),7,$-4 MOVW $0, R0 MOVW (R0), R1 + +TEXT runtime·memeq(SB),7,$-4 + MOVW a+0(FP), R1 + MOVW b+4(FP), R2 + MOVW n+8(FP), R3 + ADD R1, R3, R6 + MOVW $1, R0 +_next: + CMP R1, R6 + RET.EQ + MOVBU.P 1(R1), R4 + MOVBU.P 1(R2), R5 + CMP R4, R5 + BEQ _next + + MOVW $0, R0 + RET diff --git a/src/pkg/runtime/hashmap.c b/src/pkg/runtime/hashmap.c index 3f26a157bd..d639be3c3d 100644 --- a/src/pkg/runtime/hashmap.c +++ b/src/pkg/runtime/hashmap.c @@ -562,7 +562,7 @@ static uint8 empty_value[MAXVALUESIZE]; #define HASH_LOOKUP2 runtime·mapaccess2_faststr #define KEYTYPE String #define HASHFUNC runtime·algarray[ASTRING].hash -#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·mcmp((x).str, (y).str, (x).len) == 0)) +#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·memeq((x).str, (y).str, (x).len))) #define QUICKEQ(x) ((x).len < 32) #include "hashmap_fast.c" diff --git a/src/pkg/runtime/mapspeed_test.go b/src/pkg/runtime/mapspeed_test.go index 4d77347b24..73be434535 100644 --- a/src/pkg/runtime/mapspeed_test.go +++ b/src/pkg/runtime/mapspeed_test.go @@ -118,6 +118,17 @@ func BenchmarkMegOneMap(b *testing.B) { } } +func BenchmarkMegEqMap(b *testing.B) { + m := make(map[string]bool) + key1 := strings.Repeat("X", 1<<20) + key2 := strings.Repeat("X", 1<<20) // equal but different instance + m[key1] = true + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key2] + } +} + func BenchmarkMegEmptyMap(b *testing.B) { m := make(map[string]bool) key := strings.Repeat("X", 1<<20) diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h index 11e713a2bc..864b2aa5f7 100644 --- a/src/pkg/runtime/runtime.h +++ b/src/pkg/runtime/runtime.h @@ -593,6 +593,8 @@ void runtime·strequal(bool*, uintptr, void*, void*); void runtime·interequal(bool*, uintptr, void*, void*); void runtime·nilinterequal(bool*, uintptr, void*, void*); +bool runtime·memeq(void*, void*, uintptr); + void runtime·memprint(uintptr, void*); void runtime·strprint(uintptr, void*); void runtime·interprint(uintptr, void*); diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc index c0d3f2bde9..49bf1148b8 100644 --- a/src/pkg/runtime/string.goc +++ b/src/pkg/runtime/string.goc @@ -206,8 +206,6 @@ func cmpstring(s1 String, s2 String) (v int) { } func eqstring(s1 String, s2 String) (v bool) { - uintgo i, l; - if(s1.len != s2.len) { v = false; return; @@ -216,13 +214,7 @@ func eqstring(s1 String, s2 String) (v bool) { v = true; return; } - l = s1.len; - for(i=0; i<l; i++) - if(s1.str[i] != s2.str[i]) { - v = false; - return; - } - v = true; + v = runtime·memeq(s1.str, s2.str, s1.len); } int32 diff --git a/src/pkg/runtime/string_test.go b/src/pkg/runtime/string_test.go index 6ba3c1d292..df3ff06a7d 100644 --- a/src/pkg/runtime/string_test.go +++ b/src/pkg/runtime/string_test.go @@ -47,3 +47,31 @@ func BenchmarkCompareStringDifferentLength(b *testing.B) { } } } + +func BenchmarkCompareStringBigUnaligned(b *testing.B) { + bytes := make([]byte, 0, 1<<20) + for len(bytes) < 1<<20 { + bytes = append(bytes, "Hello Gophers!"...) + } + s1, s2 := string(bytes), "hello"+string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2[len("hello"):] { + b.Fatal("s1 != s2") + } + } + b.SetBytes(int64(len(s1))) +} + +func BenchmarkCompareStringBig(b *testing.B) { + bytes := make([]byte, 0, 1<<20) + for len(bytes) < 1<<20 { + bytes = append(bytes, "Hello Gophers!"...) + } + s1, s2 := string(bytes), string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } + b.SetBytes(int64(len(s1))) +} |
