aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorlimeidan <limeidan@loongson.cn>2025-04-22 10:24:27 +0800
committerabner chenc <chenguoqi@loongson.cn>2025-04-23 01:29:52 -0700
commitecdd429a3be7abde6e169b79da13bffdba064cb4 (patch)
treefda4186dbabf370eef6c4d3ad34171325eb6cde0 /src
parent93e4e26d5b909c3dbeeb638534461155f06ecf5c (diff)
downloadgo-ecdd429a3be7abde6e169b79da13bffdba064cb4.tar.xz
runtime: optimize the function memequal using SIMD on loong64
goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3A6000-HV @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ Equal/0 0.4012n ± 0% 0.4003n ± 0% -0.21% (p=0.000 n=10) Equal/same/1 2.555n ± 1% 2.419n ± 0% -5.32% (p=0.000 n=10) Equal/same/6 2.574n ± 1% 2.425n ± 1% -5.79% (p=0.000 n=10) Equal/same/9 2.578n ± 0% 2.419n ± 1% -6.19% (p=0.000 n=10) Equal/same/15 2.565n ± 1% 2.417n ± 0% -5.73% (p=0.000 n=10) Equal/same/16 2.576n ± 1% 2.414n ± 0% -6.31% (p=0.000 n=10) Equal/same/20 2.573n ± 1% 2.416n ± 0% -6.10% (p=0.000 n=10) Equal/same/32 2.559n ± 0% 2.411n ± 0% -5.80% (p=0.000 n=10) Equal/same/4K 2.579n ± 1% 2.410n ± 0% -6.53% (p=0.000 n=10) Equal/same/4M 2.571n ± 0% 2.411n ± 0% -6.22% (p=0.000 n=10) Equal/same/64M 2.568n ± 1% 2.413n ± 0% -6.05% (p=0.000 n=10) Equal/1 5.215n ± 0% 6.404n ± 0% +22.80% (p=0.000 n=10) Equal/6 11.630n ± 0% 6.404n ± 0% -44.94% (p=0.000 n=10) Equal/9 15.240n ± 0% 6.404n ± 0% -57.98% (p=0.000 n=10) Equal/15 22.925n ± 0% 6.404n ± 0% -72.07% (p=0.000 n=10) Equal/16 24.070n ± 0% 5.203n ± 0% -78.38% (p=0.000 n=10) Equal/20 28.880n ± 0% 6.404n ± 0% -77.83% (p=0.000 n=10) Equal/32 43.320n ± 0% 6.404n ± 0% -85.22% (p=0.000 n=10) Equal/4K 4938.50n ± 0% 55.43n ± 0% -98.88% (p=0.000 n=10) Equal/4M 5048.8µ ± 0% 202.0µ ± 0% -96.00% (p=0.000 n=10) Equal/64M 80.819m ± 0% 4.539m ± 0% -94.38% (p=0.000 n=10) EqualBothUnaligned/64_0 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10) EqualBothUnaligned/64_1 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10) EqualBothUnaligned/64_4 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10) EqualBothUnaligned/64_7 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10) EqualBothUnaligned/4096_0 4937.00n ± 0% 65.64n ± 0% -98.67% (p=0.000 n=10) EqualBothUnaligned/4096_1 4937.00n ± 0% 78.85n ± 0% -98.40% (p=0.000 n=10) EqualBothUnaligned/4096_4 4937.00n ± 0% 78.87n ± 0% -98.40% (p=0.000 n=10) EqualBothUnaligned/4096_7 4937.00n ± 0% 78.87n ± 0% -98.40% (p=0.000 n=10) EqualBothUnaligned/4194304_0 5049.2µ ± 0% 204.2µ ± 0% -95.96% (p=0.000 n=10) EqualBothUnaligned/4194304_1 5049.2µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10) EqualBothUnaligned/4194304_4 5049.4µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10) EqualBothUnaligned/4194304_7 5049.2µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10) EqualBothUnaligned/67108864_0 80.796m ± 0% 3.863m ± 0% -95.22% (p=0.000 n=10) EqualBothUnaligned/67108864_1 80.801m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10) EqualBothUnaligned/67108864_4 80.799m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10) EqualBothUnaligned/67108864_7 80.781m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10) geomean 1.040µ 149.6n -85.63% Change-Id: Id4c2bc0ca758337dd9759df83750c761814be488 Reviewed-on: https://go-review.googlesource.com/c/go/+/667255 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
Diffstat (limited to 'src')
-rw-r--r--src/internal/bytealg/equal_loong64.s287
1 files changed, 258 insertions, 29 deletions
diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s
index 830b09bd2c..8f570e8eae 100644
--- a/src/internal/bytealg/equal_loong64.s
+++ b/src/internal/bytealg/equal_loong64.s
@@ -8,37 +8,266 @@
#define REGCTXT R29
// memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
- BEQ R4, R5, eq
- ADDV R4, R6, R7
- PCALIGN $16
-loop:
- BNE R4, R7, test
- MOVV $1, R4
- RET
-test:
- MOVBU (R4), R9
- ADDV $1, R4
- MOVBU (R5), R10
- ADDV $1, R5
- BEQ R9, R10, loop
-
- MOVB R0, R4
- RET
-eq:
- MOVV $1, R4
- RET
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+ // R4 = a_base
+ // R5 = b_base
+ // R6 = size
+ JMP equalbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17
- BEQ R4, R5, eq
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
+ // R4 = a_base
+ // R5 = b_base
MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
- MOVV R4, 8(R3)
- MOVV R5, 16(R3)
- MOVV R6, 24(R3)
- JAL runtime·memequal(SB)
- MOVBU 32(R3), R4
- RET
-eq:
+ JMP equalbody<>(SB)
+
+// input:
+// R4 = a_base
+// R5 = b_base
+// R6 = size
+TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
+ // a_base == b_base
+ BEQ R4, R5, equal
+ // 0 bytes
+ BEQ R6, equal
+
+ MOVV $64, R7
+ BGE R6, R7, lasx
+
+ // size < 64 bytes
+tail:
+ MOVV $16, R7
+ BLT R6, R7, lt_16
+generic16_loop:
+ ADDV $-16, R6
+ MOVV 0(R4), R8
+ MOVV 8(R4), R9
+ MOVV 0(R5), R10
+ MOVV 8(R5), R11
+ BNE R8, R10, not_equal
+ BNE R9, R11, not_equal
+ BEQ R6, equal
+ ADDV $16, R4
+ ADDV $16, R5
+ BGE R6, R7, generic16_loop
+
+ // size < 16 bytes
+lt_16:
+ MOVV $8, R7
+ BLT R6, R7, lt_8
+ ADDV $-8, R6
+ MOVV 0(R4), R8
+ MOVV 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $8, R4
+ ADDV $8, R5
+
+ // size < 8 bytes
+lt_8:
+ MOVV $4, R7
+ BLT R6, R7, lt_4
+ ADDV $-4, R6
+ MOVW 0(R4), R8
+ MOVW 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $4, R4
+ ADDV $4, R5
+
+ // size < 4 bytes
+lt_4:
+ MOVV $2, R7
+ BLT R6, R7, lt_2
+ ADDV $-2, R6
+ MOVH 0(R4), R8
+ MOVH 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $2, R4
+ ADDV $2, R5
+
+ // size < 2 bytes
+lt_2:
+ MOVB 0(R4), R8
+ MOVB 0(R5), R9
+ BNE R8, R9, not_equal
+
+equal:
MOVV $1, R4
RET
+
+not_equal:
+ MOVV R0, R4
+ RET
+
+ // Implemented using 256-bit SIMD instructions
+lasx:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+ BEQ R7, lsx
+
+lasx256:
+ MOVV $256, R7
+ BLT R6, R7, lasx64
+lasx256_loop:
+ ADDV $-256, R6
+ XVMOVQ 0(R4), X0
+ XVMOVQ 32(R4), X1
+ XVMOVQ 64(R4), X2
+ XVMOVQ 96(R4), X3
+ XVMOVQ 128(R4), X4
+ XVMOVQ 160(R4), X5
+ XVMOVQ 192(R4), X6
+ XVMOVQ 224(R4), X7
+ XVMOVQ 0(R5), X8
+ XVMOVQ 32(R5), X9
+ XVMOVQ 64(R5), X10
+ XVMOVQ 96(R5), X11
+ XVMOVQ 128(R5), X12
+ XVMOVQ 160(R5), X13
+ XVMOVQ 192(R5), X14
+ XVMOVQ 224(R5), X15
+ XVSEQV X0, X8, X0
+ XVSEQV X1, X9, X1
+ XVSEQV X2, X10, X2
+ XVSEQV X3, X11, X3
+ XVSEQV X4, X12, X4
+ XVSEQV X5, X13, X5
+ XVSEQV X6, X14, X6
+ XVSEQV X7, X15, X7
+ XVANDV X0, X1, X0
+ XVANDV X2, X3, X2
+ XVANDV X4, X5, X4
+ XVANDV X6, X7, X6
+ XVANDV X0, X2, X0
+ XVANDV X4, X6, X4
+ XVANDV X0, X4, X0
+ XVSETALLNEV X0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $256, R4
+ ADDV $256, R5
+ BGE R6, R7, lasx256_loop
+
+lasx64:
+ MOVV $64, R7
+ BLT R6, R7, tail
+lasx64_loop:
+ ADDV $-64, R6
+ XVMOVQ 0(R4), X0
+ XVMOVQ 32(R4), X1
+ XVMOVQ 0(R5), X2
+ XVMOVQ 32(R5), X3
+ XVSEQV X0, X2, X0
+ XVSEQV X1, X3, X1
+ XVANDV X0, X1, X0
+ XVSETALLNEV X0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $64, R4
+ ADDV $64, R5
+ BGE R6, R7, lasx64_loop
+ JMP tail
+
+ // Implemented using 128-bit SIMD instructions
+lsx:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+ BEQ R7, generic64_loop
+
+lsx128:
+ MOVV $128, R7
+ BLT R6, R7, lsx32
+lsx128_loop:
+ ADDV $-128, R6
+ VMOVQ 0(R4), V0
+ VMOVQ 16(R4), V1
+ VMOVQ 32(R4), V2
+ VMOVQ 48(R4), V3
+ VMOVQ 64(R4), V4
+ VMOVQ 80(R4), V5
+ VMOVQ 96(R4), V6
+ VMOVQ 112(R4), V7
+ VMOVQ 0(R5), V8
+ VMOVQ 16(R5), V9
+ VMOVQ 32(R5), V10
+ VMOVQ 48(R5), V11
+ VMOVQ 64(R5), V12
+ VMOVQ 80(R5), V13
+ VMOVQ 96(R5), V14
+ VMOVQ 112(R5), V15
+ VSEQV V0, V8, V0
+ VSEQV V1, V9, V1
+ VSEQV V2, V10, V2
+ VSEQV V3, V11, V3
+ VSEQV V4, V12, V4
+ VSEQV V5, V13, V5
+ VSEQV V6, V14, V6
+ VSEQV V7, V15, V7
+ VANDV V0, V1, V0
+ VANDV V2, V3, V2
+ VANDV V4, V5, V4
+ VANDV V6, V7, V6
+ VANDV V0, V2, V0
+ VANDV V4, V6, V4
+ VANDV V0, V4, V0
+ VSETALLNEV V0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+
+ ADDV $128, R4
+ ADDV $128, R5
+ BGE R6, R7, lsx128_loop
+
+lsx32:
+ MOVV $32, R7
+ BLT R6, R7, tail
+lsx32_loop:
+ ADDV $-32, R6
+ VMOVQ 0(R4), V0
+ VMOVQ 16(R4), V1
+ VMOVQ 0(R5), V2
+ VMOVQ 16(R5), V3
+ VSEQV V0, V2, V0
+ VSEQV V1, V3, V1
+ VANDV V0, V1, V0
+ VSETALLNEV V0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $32, R4
+ ADDV $32, R5
+ BGE R6, R7, lsx32_loop
+ JMP tail
+
+ // Implemented using general instructions
+generic64_loop:
+ ADDV $-64, R6
+ MOVV 0(R4), R7
+ MOVV 8(R4), R8
+ MOVV 16(R4), R9
+ MOVV 24(R4), R10
+ MOVV 0(R5), R15
+ MOVV 8(R5), R16
+ MOVV 16(R5), R17
+ MOVV 24(R5), R18
+ BNE R7, R15, not_equal
+ BNE R8, R16, not_equal
+ BNE R9, R17, not_equal
+ BNE R10, R18, not_equal
+ MOVV 32(R4), R11
+ MOVV 40(R4), R12
+ MOVV 48(R4), R13
+ MOVV 56(R4), R14
+ MOVV 32(R5), R19
+ MOVV 40(R5), R20
+ MOVV 48(R5), R21
+ MOVV 56(R5), R23
+ BNE R11, R19, not_equal
+ BNE R12, R20, not_equal
+ BNE R13, R21, not_equal
+ BNE R14, R23, not_equal
+ BEQ R6, equal
+ ADDV $64, R4
+ ADDV $64, R5
+ MOVV $64, R7
+ BGE R6, R7, generic64_loop
+ JMP tail