1 files changed, 18 insertions, 1 deletions
diff --git a/src/internal/runtime/gc/scan/scan_amd64.s b/src/internal/runtime/gc/scan/scan_amd64.s
index 055995fa38..9b4950a767 100644
--- a/src/internal/runtime/gc/scan/scan_amd64.s
+++ b/src/internal/runtime/gc/scan/scan_amd64.s
@@ -86,7 +86,24 @@ loop:
 
 	// Collect just the pointers from the greyed objects into the scan buffer,
 	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
-	VPCOMPRESSQ Z1, K1, (DI)(DX*8)
+	//
+	// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
+	// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
+	// imposes a severe performance penalty of around an order of magnitude
+	// compared to a register destination.
+	//
+	// This workaround is unfortunate on other microarchitectures, where a
+	// memory destination is slightly faster than adding an additional move
+	// instruction, but nowhere near an order of magnitude. It would be
+	// nice to have a Genoa-only variant here.
+	//
+	// AMD Turin / Zen 5 fixes this issue.
+	//
+	// See
+	// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
+	VPCOMPRESSQ Z1, K1, Z2
+	VMOVDQU64 Z2, (DI)(DX*8)
+
 	// Advance the scan buffer position by the number of pointers.
 	MOVBQZX 128(AX), CX
 	ADDQ CX, DX