aboutsummaryrefslogtreecommitdiff
path: root/src/pkg/runtime
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2013-04-02 16:26:15 -0700
committerKeith Randall <khr@golang.org>2013-04-02 16:26:15 -0700
commit3d5daa23198f4b7ee71dd7647d5d061e1c883fce (patch)
tree6aa7fe53a84cc324261013cb5e912bbc33521e98 /src/pkg/runtime
parent6ca1fa625c2377071163399f1579a440e7d29502 (diff)
downloadgo-3d5daa23198f4b7ee71dd7647d5d061e1c883fce.tar.xz
runtime: Implement faster equals for strings and bytes.
(amd64) benchmark old ns/op new ns/op delta BenchmarkEqual0 16 6 -63.15% BenchmarkEqual9 22 7 -65.37% BenchmarkEqual32 36 9 -74.91% BenchmarkEqual4K 2187 120 -94.51% benchmark old MB/s new MB/s speedup BenchmarkEqual9 392.22 1134.38 2.89x BenchmarkEqual32 866.72 3457.39 3.99x BenchmarkEqual4K 1872.73 33998.87 18.15x (386) benchmark old ns/op new ns/op delta BenchmarkEqual0 16 5 -63.85% BenchmarkEqual9 22 7 -67.84% BenchmarkEqual32 34 12 -64.94% BenchmarkEqual4K 2196 113 -94.85% benchmark old MB/s new MB/s speedup BenchmarkEqual9 405.81 1260.18 3.11x BenchmarkEqual32 919.55 2631.21 2.86x BenchmarkEqual4K 1864.85 36072.54 19.34x Update #3751 R=bradfitz, r, khr, dave, remyoudompheng, fullung, minux.ma, ality CC=golang-dev https://golang.org/cl/8056043
Diffstat (limited to 'src/pkg/runtime')
-rw-r--r--src/pkg/runtime/alg.c23
-rw-r--r--src/pkg/runtime/asm_386.s115
-rw-r--r--src/pkg/runtime/asm_amd64.s112
-rw-r--r--src/pkg/runtime/asm_arm.s19
-rw-r--r--src/pkg/runtime/hashmap.c2
-rw-r--r--src/pkg/runtime/mapspeed_test.go11
-rw-r--r--src/pkg/runtime/runtime.h2
-rw-r--r--src/pkg/runtime/string.goc10
-rw-r--r--src/pkg/runtime/string_test.go28
9 files changed, 294 insertions, 28 deletions
diff --git a/src/pkg/runtime/alg.c b/src/pkg/runtime/alg.c
index 2dc8212566..a78d9780c7 100644
--- a/src/pkg/runtime/alg.c
+++ b/src/pkg/runtime/alg.c
@@ -37,25 +37,11 @@ runtime·memhash(uintptr *h, uintptr s, void *a)
void
runtime·memequal(bool *eq, uintptr s, void *a, void *b)
{
- byte *ba, *bb, *aend;
-
if(a == b) {
*eq = 1;
return;
}
- ba = a;
- bb = b;
- aend = ba+s;
- while(ba != aend) {
- if(*ba != *bb) {
- *eq = 0;
- return;
- }
- ba++;
- bb++;
- }
- *eq = 1;
- return;
+ *eq = runtime·memeq(a, b, s);
}
void
@@ -323,6 +309,7 @@ void
runtime·strequal(bool *eq, uintptr s, void *a, void *b)
{
intgo alen;
+ byte *s1, *s2;
USED(s);
alen = ((String*)a)->len;
@@ -330,11 +317,13 @@ runtime·strequal(bool *eq, uintptr s, void *a, void *b)
*eq = false;
return;
}
- if(((String*)a)->str == ((String*)b)->str) {
+ s1 = ((String*)a)->str;
+ s2 = ((String*)b)->str;
+ if(s1 == s2) {
*eq = true;
return;
}
- runtime·memequal(eq, alen, ((String*)a)->str, ((String*)b)->str);
+ *eq = runtime·memeq(s1, s2, alen);
}
void
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 57de87b8d4..531057ff8a 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -986,3 +986,118 @@ TEXT shifts(SB),7,$0
LONG $0x0c0b0a09
LONG $0xff0f0e0d
+TEXT runtime·memeq(SB),7,$0
+ MOVL a+0(FP), SI
+ MOVL b+4(FP), DI
+ MOVL count+8(FP), BX
+ JMP runtime·memeqbody(SB)
+
+
+TEXT bytes·Equal(SB),7,$0
+ MOVL a_len+4(FP), BX
+ MOVL b_len+16(FP), CX
+ XORL AX, AX
+ CMPL BX, CX
+ JNE eqret
+ MOVL a+0(FP), SI
+ MOVL b+12(FP), DI
+ CALL runtime·memeqbody(SB)
+eqret:
+ MOVB AX, ret+24(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
+TEXT runtime·memeqbody(SB),7,$0
+ XORL AX, AX
+
+ CMPL BX, $4
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPL BX, $64
+ JB bigloop
+ TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
+ JE bigloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDL $64, SI
+ ADDL $64, DI
+ SUBL $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ RET
+
+ // 4 bytes at a time using 32-bit register
+bigloop:
+ CMPL BX, $4
+ JBE leftover
+ MOVL (SI), CX
+ MOVL (DI), DX
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BX
+ CMPL CX, DX
+ JEQ bigloop
+ RET
+
+ // remaining 0-4 bytes
+leftover:
+ MOVL -4(SI)(BX*1), CX
+ MOVL -4(DI)(BX*1), DX
+ CMPL CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPL BX, $0
+ JEQ equal
+
+ LEAL 0(BX*8), CX
+ NEGL CX
+
+ MOVL SI, DX
+ CMPB DX, $0xfc
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 111111xx. Load up to bytes we want, move to correct position.
+ MOVL -4(SI)(BX*1), SI
+ SHRL CX, SI
+si_finish:
+
+ // same for DI.
+ MOVL DI, DX
+ CMPB DX, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BX*1), DI
+ SHRL CX, DI
+di_finish:
+
+ SUBL SI, DI
+ SHLL CX, DI
+equal:
+ SETEQ AX
+ RET
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index af2064ff3a..0dee1556da 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -907,3 +907,115 @@ TEXT shifts(SB),7,$0
QUAD $0xffff0f0e0d0c0b0a
QUAD $0x0807060504030201
QUAD $0xff0f0e0d0c0b0a09
+
+TEXT runtime·memeq(SB),7,$0
+ MOVQ a+0(FP), SI
+ MOVQ b+8(FP), DI
+ MOVQ count+16(FP), BX
+ JMP runtime·memeqbody(SB)
+
+
+TEXT bytes·Equal(SB),7,$0
+ MOVQ a_len+8(FP), BX
+ MOVQ b_len+32(FP), CX
+ XORQ AX, AX
+ CMPQ BX, CX
+ JNE eqret
+ MOVQ a+0(FP), SI
+ MOVQ b+24(FP), DI
+ CALL runtime·memeqbody(SB)
+eqret:
+ MOVB AX, ret+48(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
+TEXT runtime·memeqbody(SB),7,$0
+ XORQ AX, AX
+
+ CMPQ BX, $8
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPQ BX, $64
+ JB bigloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ RET
+
+ // 8 bytes at a time using 64-bit register
+bigloop:
+ CMPQ BX, $8
+ JBE leftover
+ MOVQ (SI), CX
+ MOVQ (DI), DX
+ ADDQ $8, SI
+ ADDQ $8, DI
+ SUBQ $8, BX
+ CMPQ CX, DX
+ JEQ bigloop
+ RET
+
+ // remaining 0-8 bytes
+leftover:
+ MOVQ -8(SI)(BX*1), CX
+ MOVQ -8(DI)(BX*1), DX
+ CMPQ CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPQ BX, $0
+ JEQ equal
+
+ LEAQ 0(BX*8), CX
+ NEGQ CX
+
+ CMPB SI, $0xf8
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 11111xxx. Load up to bytes we want, move to correct position.
+ MOVQ -8(SI)(BX*1), SI
+ SHRQ CX, SI
+si_finish:
+
+ // same for DI.
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(BX*1), DI
+ SHRQ CX, DI
+di_finish:
+
+ SUBQ SI, DI
+ SHLQ CX, DI
+equal:
+ SETEQ AX
+ RET
diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s
index e544933326..ee9acb749c 100644
--- a/src/pkg/runtime/asm_arm.s
+++ b/src/pkg/runtime/asm_arm.s
@@ -487,7 +487,7 @@ TEXT runtime·stackguard(SB),7,$0
MOVW R2, limit+4(FP)
RET
-// not implemented for ARM
+// AES hashing not implemented for ARM
TEXT runtime·aeshash(SB),7,$-4
MOVW $0, R0
MOVW (R0), R1
@@ -500,3 +500,20 @@ TEXT runtime·aeshash64(SB),7,$-4
TEXT runtime·aeshashstr(SB),7,$-4
MOVW $0, R0
MOVW (R0), R1
+
+TEXT runtime·memeq(SB),7,$-4
+ MOVW a+0(FP), R1
+ MOVW b+4(FP), R2
+ MOVW n+8(FP), R3
+ ADD R1, R3, R6
+ MOVW $1, R0
+_next:
+ CMP R1, R6
+ RET.EQ
+ MOVBU.P 1(R1), R4
+ MOVBU.P 1(R2), R5
+ CMP R4, R5
+ BEQ _next
+
+ MOVW $0, R0
+ RET
diff --git a/src/pkg/runtime/hashmap.c b/src/pkg/runtime/hashmap.c
index 3f26a157bd..d639be3c3d 100644
--- a/src/pkg/runtime/hashmap.c
+++ b/src/pkg/runtime/hashmap.c
@@ -562,7 +562,7 @@ static uint8 empty_value[MAXVALUESIZE];
#define HASH_LOOKUP2 runtime·mapaccess2_faststr
#define KEYTYPE String
#define HASHFUNC runtime·algarray[ASTRING].hash
-#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·mcmp((x).str, (y).str, (x).len) == 0))
+#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·memeq((x).str, (y).str, (x).len)))
#define QUICKEQ(x) ((x).len < 32)
#include "hashmap_fast.c"
diff --git a/src/pkg/runtime/mapspeed_test.go b/src/pkg/runtime/mapspeed_test.go
index 4d77347b24..73be434535 100644
--- a/src/pkg/runtime/mapspeed_test.go
+++ b/src/pkg/runtime/mapspeed_test.go
@@ -118,6 +118,17 @@ func BenchmarkMegOneMap(b *testing.B) {
}
}
+func BenchmarkMegEqMap(b *testing.B) {
+ m := make(map[string]bool)
+ key1 := strings.Repeat("X", 1<<20)
+ key2 := strings.Repeat("X", 1<<20) // equal but different instance
+ m[key1] = true
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, _ = m[key2]
+ }
+}
+
func BenchmarkMegEmptyMap(b *testing.B) {
m := make(map[string]bool)
key := strings.Repeat("X", 1<<20)
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 11e713a2bc..864b2aa5f7 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -593,6 +593,8 @@ void runtime·strequal(bool*, uintptr, void*, void*);
void runtime·interequal(bool*, uintptr, void*, void*);
void runtime·nilinterequal(bool*, uintptr, void*, void*);
+bool runtime·memeq(void*, void*, uintptr);
+
void runtime·memprint(uintptr, void*);
void runtime·strprint(uintptr, void*);
void runtime·interprint(uintptr, void*);
diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc
index c0d3f2bde9..49bf1148b8 100644
--- a/src/pkg/runtime/string.goc
+++ b/src/pkg/runtime/string.goc
@@ -206,8 +206,6 @@ func cmpstring(s1 String, s2 String) (v int) {
}
func eqstring(s1 String, s2 String) (v bool) {
- uintgo i, l;
-
if(s1.len != s2.len) {
v = false;
return;
@@ -216,13 +214,7 @@ func eqstring(s1 String, s2 String) (v bool) {
v = true;
return;
}
- l = s1.len;
- for(i=0; i<l; i++)
- if(s1.str[i] != s2.str[i]) {
- v = false;
- return;
- }
- v = true;
+ v = runtime·memeq(s1.str, s2.str, s1.len);
}
int32
diff --git a/src/pkg/runtime/string_test.go b/src/pkg/runtime/string_test.go
index 6ba3c1d292..df3ff06a7d 100644
--- a/src/pkg/runtime/string_test.go
+++ b/src/pkg/runtime/string_test.go
@@ -47,3 +47,31 @@ func BenchmarkCompareStringDifferentLength(b *testing.B) {
}
}
}
+
+func BenchmarkCompareStringBigUnaligned(b *testing.B) {
+ bytes := make([]byte, 0, 1<<20)
+ for len(bytes) < 1<<20 {
+ bytes = append(bytes, "Hello Gophers!"...)
+ }
+ s1, s2 := string(bytes), "hello"+string(bytes)
+ for i := 0; i < b.N; i++ {
+ if s1 != s2[len("hello"):] {
+ b.Fatal("s1 != s2")
+ }
+ }
+ b.SetBytes(int64(len(s1)))
+}
+
+func BenchmarkCompareStringBig(b *testing.B) {
+ bytes := make([]byte, 0, 1<<20)
+ for len(bytes) < 1<<20 {
+ bytes = append(bytes, "Hello Gophers!"...)
+ }
+ s1, s2 := string(bytes), string(bytes)
+ for i := 0; i < b.N; i++ {
+ if s1 != s2 {
+ b.Fatal("s1 != s2")
+ }
+ }
+ b.SetBytes(int64(len(s1)))
+}