diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/internal/bytealg/bytealg.go | 10 | ||||
| -rw-r--r-- | src/internal/bytealg/equal_riscv64.s | 30 |
2 files changed, 36 insertions, 4 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
index 711df74baf..319ea54ba3 100644
--- a/src/internal/bytealg/bytealg.go
+++ b/src/internal/bytealg/bytealg.go
@@ -11,16 +11,18 @@ import (
 
 // Offsets into internal/cpu records for use in assembly.
 const (
-	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
-	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
-	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
+
+	offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV)
 
 	offsetLOONG64HasLSX  = unsafe.Offsetof(cpu.Loong64.HasLSX)
 	offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
 
 	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
 
-	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
+	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
 )
 
 // MaxLen is the maximum length of the string to be searched for (argument b) in Index.
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
index 87b2d79302..58e033f847 100644
--- a/src/internal/bytealg/equal_riscv64.s
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include "asm_riscv64.h"
 #include "go_asm.h"
 #include "textflag.h"
 
@@ -28,6 +29,35 @@ length_check:
 	MOV	$32, X23
 	BLT	X12, X23, loop4_check
 
+#ifndef hasV
+	MOVB	internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
+	BEQZ	X5, equal_scalar
+#endif
+
+	// Use vector if not 8 byte aligned.
+	OR	X10, X11, X5
+	AND	$7, X5
+	BNEZ	X5, vector_loop
+
+	// Use scalar if 8 byte aligned and <= 64 bytes.
+	SUB	$64, X12, X6
+	BLEZ	X6, loop32_check
+
+	PCALIGN	$16
+vector_loop:
+	VSETVLI	X12, E8, M8, TA, MA, X5
+	VLE8V	(X10), V8
+	VLE8V	(X11), V16
+	VMSNEVV	V8, V16, V0
+	VFIRSTM	V0, X6
+	BGEZ	X6, done
+	ADD	X5, X10
+	ADD	X5, X11
+	SUB	X5, X12
+	BNEZ	X12, vector_loop
+	JMP	done
+
+equal_scalar:
 	// Check alignment - if alignment differs we have to do one byte at a time.
 	AND	$7, X10, X9
 	AND	$7, X11, X19
