aboutsummaryrefslogtreecommitdiff
path: root/src/internal/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/runtime')
-rw-r--r-- src/internal/runtime/gc/scan/scan_amd64.s 19
1 files changed, 18 insertions, 1 deletions
diff --git a/src/internal/runtime/gc/scan/scan_amd64.s b/src/internal/runtime/gc/scan/scan_amd64.s
index 055995fa38..9b4950a767 100644
--- a/src/internal/runtime/gc/scan/scan_amd64.s
+++ b/src/internal/runtime/gc/scan/scan_amd64.s
@@ -86,7 +86,24 @@ loop:
// Collect just the pointers from the greyed objects into the scan buffer,
// i.e., copy the word indices in the mask from Z1 into contiguous memory.
- VPCOMPRESSQ Z1, K1, (DI)(DX*8)
+ //
+ // N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
+ // AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
+ // imposes a severe performance penalty of around an order of magnitude
+ // compared to a register destination.
+ //
+ // This workaround is unfortunate on other microarchitectures, where a
+ // memory destination is slightly faster than adding an additional move
+ // instruction, but nowhere near an order of magnitude. It would be
+ // nice to have a Genoa-only variant here.
+ //
+ // AMD Turin / Zen 5 fixes this issue.
+ //
+ // See
+ // https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
+ VPCOMPRESSQ Z1, K1, Z2
+ VMOVDQU64 Z2, (DI)(DX*8)
+
// Advance the scan buffer position by the number of pointers.
MOVBQZX 128(AX), CX
ADDQ CX, DX