path: root/src/runtime
author	Keith Randall <khr@golang.org>	2016-03-06 16:58:30 -0800
committer	Keith Randall <khr@golang.org>	2016-03-21 19:10:24 +0000
commit	6a33f7765f79cf2f00f5ca55832d2cfab8beb289 (patch)
tree	d1cd6a803e32d952e60290c4c863c65bd88db808 /src/runtime
parent	b07a214d39814545bbcd1d30f1850a95752dac65 (diff)
download	go-6a33f7765f79cf2f00f5ca55832d2cfab8beb289.tar.xz
runtime: use MOVSB instead of MOVSQ for unaligned moves
MOVSB is quite a bit faster for unaligned moves. Possibly we should use
MOVSB all of the time, but Intel folks say it might be a bit faster to
use MOVSQ on some processors (but not any I have access to at the moment).

benchmark                             old ns/op     new ns/op     delta
BenchmarkMemmove4096-8                93.9          93.2          -0.75%
BenchmarkMemmoveUnalignedDst4096-8    256           151           -41.02%
BenchmarkMemmoveUnalignedSrc4096-8    175           90.5          -48.29%

Fixes #14630

Change-Id: I568e6d6590eb3615e6a699fb474020596be665ff
Reviewed-on: https://go-review.googlesource.com/20293
Reviewed-by: Ian Lance Taylor <iant@golang.org>
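As an illustrative sketch only (the runtime's real implementation is the assembly below), the new forward-copy dispatch amounts to: OR the source and destination addresses, test the low bits, and fall back to a byte-at-a-time copy when either pointer is unaligned; otherwise keep copying a word at a time. In Go, the idea looks roughly like this (forwardCopy is a hypothetical stand-in; the byte and word loops model REP MOVSB and REP MOVSQ):

package main

import (
	"fmt"
	"unsafe"
)

// forwardCopy models the dispatch: a combined alignment check, then either
// a byte-wise copy (REP MOVSB in the assembly) or a word-wise copy with a
// byte tail (REP MOVSQ plus the tail). Assumes len(dst) >= len(src).
func forwardCopy(dst, src []byte) {
	if len(src) == 0 {
		return
	}
	d := uintptr(unsafe.Pointer(&dst[0]))
	s := uintptr(unsafe.Pointer(&src[0]))
	if (d|s)&7 != 0 {
		// Unaligned: one byte at a time.
		for i := range src {
			dst[i] = src[i]
		}
		return
	}
	// Aligned: 8 bytes at a time, then a byte tail for the remainder.
	i := 0
	for ; i+8 <= len(src); i += 8 {
		copy(dst[i:i+8], src[i:i+8])
	}
	for ; i < len(src); i++ {
		dst[i] = src[i]
	}
}

func main() {
	dst := make([]byte, 16)
	forwardCopy(dst, []byte("0123456789abcdef"))
	fmt.Printf("%s\n", dst)
}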
Diffstat (limited to 'src/runtime')
-rw-r--r--	src/runtime/memmove_386.s	14
-rw-r--r--	src/runtime/memmove_amd64.s	13
-rw-r--r--	src/runtime/memmove_test.go	87
3 files changed, 87 insertions, 27 deletions
diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s
index 9a21e84136..d4baf2280a 100644
--- a/src/runtime/memmove_386.s
+++ b/src/runtime/memmove_386.s
@@ -69,13 +69,25 @@ nosse2:
/*
* forward copy loop
*/
-forward:
+forward:
+ // Check alignment
+ MOVL SI, AX
+ ORL DI, AX
+ TESTL $3, AX
+ JNE unaligned_fwd
+
MOVL BX, CX
SHRL $2, CX
ANDL $3, BX
REP; MOVSL
JMP tail
+
+unaligned_fwd:
+ MOVL BX, CX
+ REP; MOVSB
+ RET
+
/*
* check overlap
*/
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index ae95b155be..514eb169f1 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -77,12 +77,25 @@ forward:
CMPQ BX, $2048
JLS move_256through2048
+ // Check alignment
+ MOVQ SI, AX
+ ORQ DI, AX
+ TESTL $7, AX
+ JNE unaligned_fwd
+
+ // Aligned - do 8 bytes at a time
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
REP; MOVSQ
JMP tail
+unaligned_fwd:
+ // Unaligned - do 1 byte at a time
+ MOVQ BX, CX
+ REP; MOVSB
+ RET
+
back:
/*
* check overlap
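A note on the alignment test in both hunks above: the source and destination addresses are OR-ed together before masking the low bits, because the OR has a nonzero low bit exactly when at least one of the two addresses does, so a single TEST covers both pointers. A minimal Go illustration of that check (not runtime code; bothAligned is a hypothetical name):

package main

import "fmt"

// bothAligned mirrors the ORQ/TESTL $7 pair in the amd64 hunk (the 386
// version masks with $3 for 4-byte alignment instead).
func bothAligned(dst, src uintptr) bool {
	return (dst|src)&7 == 0
}

func main() {
	fmt.Println(bothAligned(0x1000, 0x2008)) // true: both 8-byte aligned
	fmt.Println(bothAligned(0x1000, 0x2003)) // false: source is unaligned
}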
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 371d84b6be..8bf0c65e29 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -116,7 +116,7 @@ func BenchmarkMemmove1024(b *testing.B) { bmMemmove(b, 1024) }
func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }
func BenchmarkMemmove4096(b *testing.B) { bmMemmove(b, 4096) }
-func bmMemmoveUnaligned(b *testing.B, n int) {
+func bmMemmoveUnalignedDst(b *testing.B, n int) {
x := make([]byte, n+1)
y := make([]byte, n)
b.SetBytes(int64(n))
@@ -125,31 +125,66 @@ func bmMemmoveUnaligned(b *testing.B, n int) {
}
}
-func BenchmarkMemmoveUnaligned0(b *testing.B) { bmMemmoveUnaligned(b, 0) }
-func BenchmarkMemmoveUnaligned1(b *testing.B) { bmMemmoveUnaligned(b, 1) }
-func BenchmarkMemmoveUnaligned2(b *testing.B) { bmMemmoveUnaligned(b, 2) }
-func BenchmarkMemmoveUnaligned3(b *testing.B) { bmMemmoveUnaligned(b, 3) }
-func BenchmarkMemmoveUnaligned4(b *testing.B) { bmMemmoveUnaligned(b, 4) }
-func BenchmarkMemmoveUnaligned5(b *testing.B) { bmMemmoveUnaligned(b, 5) }
-func BenchmarkMemmoveUnaligned6(b *testing.B) { bmMemmoveUnaligned(b, 6) }
-func BenchmarkMemmoveUnaligned7(b *testing.B) { bmMemmoveUnaligned(b, 7) }
-func BenchmarkMemmoveUnaligned8(b *testing.B) { bmMemmoveUnaligned(b, 8) }
-func BenchmarkMemmoveUnaligned9(b *testing.B) { bmMemmoveUnaligned(b, 9) }
-func BenchmarkMemmoveUnaligned10(b *testing.B) { bmMemmoveUnaligned(b, 10) }
-func BenchmarkMemmoveUnaligned11(b *testing.B) { bmMemmoveUnaligned(b, 11) }
-func BenchmarkMemmoveUnaligned12(b *testing.B) { bmMemmoveUnaligned(b, 12) }
-func BenchmarkMemmoveUnaligned13(b *testing.B) { bmMemmoveUnaligned(b, 13) }
-func BenchmarkMemmoveUnaligned14(b *testing.B) { bmMemmoveUnaligned(b, 14) }
-func BenchmarkMemmoveUnaligned15(b *testing.B) { bmMemmoveUnaligned(b, 15) }
-func BenchmarkMemmoveUnaligned16(b *testing.B) { bmMemmoveUnaligned(b, 16) }
-func BenchmarkMemmoveUnaligned32(b *testing.B) { bmMemmoveUnaligned(b, 32) }
-func BenchmarkMemmoveUnaligned64(b *testing.B) { bmMemmoveUnaligned(b, 64) }
-func BenchmarkMemmoveUnaligned128(b *testing.B) { bmMemmoveUnaligned(b, 128) }
-func BenchmarkMemmoveUnaligned256(b *testing.B) { bmMemmoveUnaligned(b, 256) }
-func BenchmarkMemmoveUnaligned512(b *testing.B) { bmMemmoveUnaligned(b, 512) }
-func BenchmarkMemmoveUnaligned1024(b *testing.B) { bmMemmoveUnaligned(b, 1024) }
-func BenchmarkMemmoveUnaligned2048(b *testing.B) { bmMemmoveUnaligned(b, 2048) }
-func BenchmarkMemmoveUnaligned4096(b *testing.B) { bmMemmoveUnaligned(b, 4096) }
+func BenchmarkMemmoveUnalignedDst0(b *testing.B) { bmMemmoveUnalignedDst(b, 0) }
+func BenchmarkMemmoveUnalignedDst1(b *testing.B) { bmMemmoveUnalignedDst(b, 1) }
+func BenchmarkMemmoveUnalignedDst2(b *testing.B) { bmMemmoveUnalignedDst(b, 2) }
+func BenchmarkMemmoveUnalignedDst3(b *testing.B) { bmMemmoveUnalignedDst(b, 3) }
+func BenchmarkMemmoveUnalignedDst4(b *testing.B) { bmMemmoveUnalignedDst(b, 4) }
+func BenchmarkMemmoveUnalignedDst5(b *testing.B) { bmMemmoveUnalignedDst(b, 5) }
+func BenchmarkMemmoveUnalignedDst6(b *testing.B) { bmMemmoveUnalignedDst(b, 6) }
+func BenchmarkMemmoveUnalignedDst7(b *testing.B) { bmMemmoveUnalignedDst(b, 7) }
+func BenchmarkMemmoveUnalignedDst8(b *testing.B) { bmMemmoveUnalignedDst(b, 8) }
+func BenchmarkMemmoveUnalignedDst9(b *testing.B) { bmMemmoveUnalignedDst(b, 9) }
+func BenchmarkMemmoveUnalignedDst10(b *testing.B) { bmMemmoveUnalignedDst(b, 10) }
+func BenchmarkMemmoveUnalignedDst11(b *testing.B) { bmMemmoveUnalignedDst(b, 11) }
+func BenchmarkMemmoveUnalignedDst12(b *testing.B) { bmMemmoveUnalignedDst(b, 12) }
+func BenchmarkMemmoveUnalignedDst13(b *testing.B) { bmMemmoveUnalignedDst(b, 13) }
+func BenchmarkMemmoveUnalignedDst14(b *testing.B) { bmMemmoveUnalignedDst(b, 14) }
+func BenchmarkMemmoveUnalignedDst15(b *testing.B) { bmMemmoveUnalignedDst(b, 15) }
+func BenchmarkMemmoveUnalignedDst16(b *testing.B) { bmMemmoveUnalignedDst(b, 16) }
+func BenchmarkMemmoveUnalignedDst32(b *testing.B) { bmMemmoveUnalignedDst(b, 32) }
+func BenchmarkMemmoveUnalignedDst64(b *testing.B) { bmMemmoveUnalignedDst(b, 64) }
+func BenchmarkMemmoveUnalignedDst128(b *testing.B) { bmMemmoveUnalignedDst(b, 128) }
+func BenchmarkMemmoveUnalignedDst256(b *testing.B) { bmMemmoveUnalignedDst(b, 256) }
+func BenchmarkMemmoveUnalignedDst512(b *testing.B) { bmMemmoveUnalignedDst(b, 512) }
+func BenchmarkMemmoveUnalignedDst1024(b *testing.B) { bmMemmoveUnalignedDst(b, 1024) }
+func BenchmarkMemmoveUnalignedDst2048(b *testing.B) { bmMemmoveUnalignedDst(b, 2048) }
+func BenchmarkMemmoveUnalignedDst4096(b *testing.B) { bmMemmoveUnalignedDst(b, 4096) }
+
+func bmMemmoveUnalignedSrc(b *testing.B, n int) {
+ x := make([]byte, n)
+ y := make([]byte, n+1)
+ b.SetBytes(int64(n))
+ for i := 0; i < b.N; i++ {
+ copy(x, y[1:])
+ }
+}
+
+func BenchmarkMemmoveUnalignedSrc0(b *testing.B) { bmMemmoveUnalignedSrc(b, 0) }
+func BenchmarkMemmoveUnalignedSrc1(b *testing.B) { bmMemmoveUnalignedSrc(b, 1) }
+func BenchmarkMemmoveUnalignedSrc2(b *testing.B) { bmMemmoveUnalignedSrc(b, 2) }
+func BenchmarkMemmoveUnalignedSrc3(b *testing.B) { bmMemmoveUnalignedSrc(b, 3) }
+func BenchmarkMemmoveUnalignedSrc4(b *testing.B) { bmMemmoveUnalignedSrc(b, 4) }
+func BenchmarkMemmoveUnalignedSrc5(b *testing.B) { bmMemmoveUnalignedSrc(b, 5) }
+func BenchmarkMemmoveUnalignedSrc6(b *testing.B) { bmMemmoveUnalignedSrc(b, 6) }
+func BenchmarkMemmoveUnalignedSrc7(b *testing.B) { bmMemmoveUnalignedSrc(b, 7) }
+func BenchmarkMemmoveUnalignedSrc8(b *testing.B) { bmMemmoveUnalignedSrc(b, 8) }
+func BenchmarkMemmoveUnalignedSrc9(b *testing.B) { bmMemmoveUnalignedSrc(b, 9) }
+func BenchmarkMemmoveUnalignedSrc10(b *testing.B) { bmMemmoveUnalignedSrc(b, 10) }
+func BenchmarkMemmoveUnalignedSrc11(b *testing.B) { bmMemmoveUnalignedSrc(b, 11) }
+func BenchmarkMemmoveUnalignedSrc12(b *testing.B) { bmMemmoveUnalignedSrc(b, 12) }
+func BenchmarkMemmoveUnalignedSrc13(b *testing.B) { bmMemmoveUnalignedSrc(b, 13) }
+func BenchmarkMemmoveUnalignedSrc14(b *testing.B) { bmMemmoveUnalignedSrc(b, 14) }
+func BenchmarkMemmoveUnalignedSrc15(b *testing.B) { bmMemmoveUnalignedSrc(b, 15) }
+func BenchmarkMemmoveUnalignedSrc16(b *testing.B) { bmMemmoveUnalignedSrc(b, 16) }
+func BenchmarkMemmoveUnalignedSrc32(b *testing.B) { bmMemmoveUnalignedSrc(b, 32) }
+func BenchmarkMemmoveUnalignedSrc64(b *testing.B) { bmMemmoveUnalignedSrc(b, 64) }
+func BenchmarkMemmoveUnalignedSrc128(b *testing.B) { bmMemmoveUnalignedSrc(b, 128) }
+func BenchmarkMemmoveUnalignedSrc256(b *testing.B) { bmMemmoveUnalignedSrc(b, 256) }
+func BenchmarkMemmoveUnalignedSrc512(b *testing.B) { bmMemmoveUnalignedSrc(b, 512) }
+func BenchmarkMemmoveUnalignedSrc1024(b *testing.B) { bmMemmoveUnalignedSrc(b, 1024) }
+func BenchmarkMemmoveUnalignedSrc2048(b *testing.B) { bmMemmoveUnalignedSrc(b, 2048) }
+func BenchmarkMemmoveUnalignedSrc4096(b *testing.B) { bmMemmoveUnalignedSrc(b, 4096) }
func TestMemclr(t *testing.T) {
size := 512
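To exercise the new benchmarks, an invocation along these lines should work (a suggested command, not part of the change; adjust the regexp as needed):

go test -run=NONE -bench='MemmoveUnaligned(Dst|Src)' runtime

Collecting several runs before and after the change and comparing them, for example with a tool such as benchstat, is how deltas like the ones quoted in the commit message are typically produced.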