aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/internal/runtime/gc/scan/filter.go2
-rw-r--r--src/internal/runtime/gc/scan/filter_amd64.go9
-rw-r--r--src/internal/runtime/gc/scan/filter_amd64.s64
-rw-r--r--src/internal/runtime/gc/scan/filter_amd64_test.go19
-rw-r--r--src/internal/runtime/gc/scan/filter_test.go36
-rw-r--r--src/internal/runtime/gc/scan/scan_amd64.go2
6 files changed, 113 insertions, 19 deletions
diff --git a/src/internal/runtime/gc/scan/filter.go b/src/internal/runtime/gc/scan/filter.go
index 63cee9abf0..08b5fd5b54 100644
--- a/src/internal/runtime/gc/scan/filter.go
+++ b/src/internal/runtime/gc/scan/filter.go
@@ -9,8 +9,6 @@ import "unsafe"
// FilterNil packs non-nil (non-zero) values in bufp together
// at the beginning of bufp, returning the length of the
// packed buffer. It treats bufp as an array of size n.
-//
-// TODO(mknyszek): Add a faster SIMD-based implementation.
func FilterNil(bufp *uintptr, n int32) int32 {
buf := unsafe.Slice(bufp, int(n))
lo := 0
diff --git a/src/internal/runtime/gc/scan/filter_amd64.go b/src/internal/runtime/gc/scan/filter_amd64.go
new file mode 100644
index 0000000000..c750741f7f
--- /dev/null
+++ b/src/internal/runtime/gc/scan/filter_amd64.go
@@ -0,0 +1,9 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+// FilterNilAVX512 is the simd version of FilterNil,
+// it is implemented in assembly.
+func FilterNilAVX512(bufp *uintptr, n int32) int32
diff --git a/src/internal/runtime/gc/scan/filter_amd64.s b/src/internal/runtime/gc/scan/filter_amd64.s
new file mode 100644
index 0000000000..47330b62ee
--- /dev/null
+++ b/src/internal/runtime/gc/scan/filter_amd64.s
@@ -0,0 +1,64 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
+ // Load arguments
+ MOVQ bufp+0(FP), R8 // R8 = bufp (start of the uint64 array)
+ MOVL n+8(FP), R9 // R9 = n (total length)
+ XORL R10, R10 // R10 = 0 (scanned = 0)
+ XORL R11, R11 // R11 = 0 (cnt = 0)
+
+ MOVL R9, R12 // R12 = n
+ SUBL R10, R12 // R12 = n - scanned
+ CMPL R12, $8 // Compare (n - scanned) with 8
+ JLT scalar_loop // If (n - scanned) < 8, jump to the scalar cleanup
+
+vector_loop:
+ LEAQ (R8)(R10*8), R13 // R13 = buf[scanned:] address
+ VMOVDQU64 (R13), Z1 // Z1 = v (Load 8 uint64s)
+ VPCMPUQ $4, Z1, Z15, K1 // Z15 is always 0, compare Z1 with 0, results in K1.
+
+ LEAQ (R8)(R11*8), R14 // R14 = buf[cnt:] address
+ VPCOMPRESSQ Z1, K1, Z1 // compress v
+ VMOVDQU64 Z1, (R14) // store v to buf[cnt:]
+
+ KMOVW K1, R15
+ POPCNTL R15, R15 // R15 = popcount(K1)
+
+ ADDL R15, R11 // cnt += popcount(K1)
+ ADDL $8, R10 // scanned += 8
+
+ MOVL R9, R12 // R12 = n
+ SUBL R10, R12 // R12 = n - scanned
+ CMPL R12, $8 // Compare (n - scanned) with 8
+ JGE vector_loop // If (n - scanned) >= 8, continue loop
+
+scalar_loop:
+ CMPL R10, R9 // Compare scanned with n
+ JGE end // If scanned >= n, loop is done
+
+scalar_next_i:
+ LEAQ (R8)(R10*8), R13 // R13 = &buf[scanned]
+ MOVQ (R13), R14 // R14 = buf[scanned]
+
+ CMPQ R14, $0
+ JE scalar_increment_i // If buf[i] == 0, skip to increment i
+
+ LEAQ (R8)(R11*8), R15 // R15 = &buf[cnt]
+ MOVQ R14, (R15) // buf[cnt] = buf[scanned]
+
+ ADDL $1, R11 // cnt++
+
+scalar_increment_i:
+ ADDL $1, R10 // scanned++
+
+ CMPL R10, R9
+ JL scalar_next_i // if scanned < n, continue
+
+end:
+ MOVL R11, ret+16(FP)
+ RET
diff --git a/src/internal/runtime/gc/scan/filter_amd64_test.go b/src/internal/runtime/gc/scan/filter_amd64_test.go
new file mode 100644
index 0000000000..f5426385f6
--- /dev/null
+++ b/src/internal/runtime/gc/scan/filter_amd64_test.go
@@ -0,0 +1,19 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+package scan_test
+
+import (
+ "internal/runtime/gc/scan"
+ "testing"
+)
+
+func TestFilterNilAVX512(t *testing.T) {
+ if !scan.CanAVX512() {
+ t.Skip("AVX512 is required for TestFilterNilAVX512")
+ }
+ runTestFilterNil(t, scan.FilterNilAVX512)
+}
diff --git a/src/internal/runtime/gc/scan/filter_test.go b/src/internal/runtime/gc/scan/filter_test.go
index 115fbfb8bc..7b5dd32db3 100644
--- a/src/internal/runtime/gc/scan/filter_test.go
+++ b/src/internal/runtime/gc/scan/filter_test.go
@@ -10,56 +10,60 @@ import (
)
func TestFilterNil(t *testing.T) {
+ runTestFilterNil(t, scan.FilterNil)
+}
+
+func runTestFilterNil(t *testing.T, filterNil func(*uintptr, int32) int32) {
t.Run("empty", func(t *testing.T) {
- testFilterNil(t, []uintptr{}, []uintptr{})
+ testFilterNil(t, []uintptr{}, []uintptr{}, filterNil)
})
t.Run("one", func(t *testing.T) {
- testFilterNil(t, []uintptr{4}, []uintptr{4})
+ testFilterNil(t, []uintptr{4}, []uintptr{4}, filterNil)
})
t.Run("elimOne", func(t *testing.T) {
- testFilterNil(t, []uintptr{0}, []uintptr{})
+ testFilterNil(t, []uintptr{0}, []uintptr{}, filterNil)
})
t.Run("oneElimBegin", func(t *testing.T) {
- testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
+ testFilterNil(t, []uintptr{0, 4}, []uintptr{4}, filterNil)
})
t.Run("oneElimEnd", func(t *testing.T) {
- testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
+ testFilterNil(t, []uintptr{4, 0}, []uintptr{4}, filterNil)
})
t.Run("oneElimMultiBegin", func(t *testing.T) {
- testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
+ testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4}, filterNil)
})
t.Run("oneElimMultiEnd", func(t *testing.T) {
- testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
+ testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4}, filterNil)
})
t.Run("oneElimMulti", func(t *testing.T) {
- testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
+ testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4}, filterNil)
})
t.Run("two", func(t *testing.T) {
- testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
+ testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12}, filterNil)
})
t.Run("twoElimBegin", func(t *testing.T) {
- testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
+ testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12}, filterNil)
})
t.Run("twoElimMid", func(t *testing.T) {
- testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
+ testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12}, filterNil)
})
t.Run("twoElimEnd", func(t *testing.T) {
- testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
+ testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}, filterNil)
})
t.Run("twoElimMulti", func(t *testing.T) {
- testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
+ testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}, filterNil)
})
t.Run("Multi", func(t *testing.T) {
- testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
+ testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}, filterNil)
})
}
-func testFilterNil(t *testing.T, buf, want []uintptr) {
+func testFilterNil(t *testing.T, buf, want []uintptr, filterNil func(*uintptr, int32) int32) {
var bufp *uintptr
if len(buf) != 0 {
bufp = &buf[0]
}
- n := scan.FilterNil(bufp, int32(len(buf)))
+ n := filterNil(bufp, int32(len(buf)))
if n > int32(len(buf)) {
t.Errorf("bogus new length returned: %d > %d", n, len(buf))
return
diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go
index 2ac181f97e..0151804fa5 100644
--- a/src/internal/runtime/gc/scan/scan_amd64.go
+++ b/src/internal/runtime/gc/scan/scan_amd64.go
@@ -28,7 +28,7 @@ func CanAVX512() bool {
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
- return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+ return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
}
//go:noescape