6 files changed, 232 insertions, 78 deletions
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/archsimd/_gen/simdgen/gen_simdMachineOps.go
index 3d99dd2a81..94b122ac39 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdMachineOps.go
@@ -181,7 +181,7 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
 		}
 		hasMerging = gOp.hasMaskedMerging(maskType, shapeOut)
 		if hasMerging && !resultInArg0 {
-			// We have to copy the slice here becasue the sort will be visible from other
+			// We have to copy the slice here because the sort will be visible from other
 			// aliases when no reslicing is happening.
 			newIn := make([]Operand, len(op.In), len(op.In)+1)
 			copy(newIn, op.In)
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
index dd3a75eb44..4f1c70e211 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
@@ -189,6 +189,7 @@ type X86Features struct {}
 var X86 X86Features
 
 {{range .}}
+{{$f := .}}
 {{- if eq .Feature "AVX512"}}
 // {{.Feature}} returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
 //
@@ -199,11 +200,19 @@ var X86 X86Features
 {{- else -}}
 // {{.Feature}} returns whether the CPU supports the {{.Feature}} feature.
 {{- end}}
+{{- if ne .ImpliesAll ""}}
+//
+// If it returns true, then the CPU also supports {{.ImpliesAll}}.
+{{- end}}
 //
 // {{.Feature}} is defined on all GOARCHes, but will only return true on
 // GOARCH {{.GoArch}}.
-func (X86Features) {{.Feature}}() bool {
-	return cpu.X86.Has{{.Feature}}
+func ({{.FeatureVar}}Features) {{.Feature}}() bool {
+{{- if .Virtual}}
+	return {{range $i, $dep := .Implies}}{{if $i}} && {{end}}cpu.{{$f.FeatureVar}}.Has{{$dep}}{{end}}
+{{- else}}
+	return cpu.{{.FeatureVar}}.Has{{.Feature}}
+{{- end}}
 }
 {{end}}
 `
@@ -591,6 +600,65 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
 	return buffer
 }
 
+// goarchFeatures holds the CPU feature-check information for one GOARCH.
+type goarchFeatures struct {
+	// featureVar names the exported feature-check variable (e.g., "X86").
+	featureVar string
+
+	// features records per-feature information, keyed by feature name.
+	features map[string]featureInfo
+}
+
+// featureInfo records how one CPU feature relates to other CPU features.
+type featureInfo struct {
+	// Implies lists the other CPU features that this feature requires.
+	// These are allowed to chain. For example, if the Frob feature lists
+	// "Baz", then if X.Frob() returns true, it must also be true that the
+	// CPU has feature Baz.
+	Implies []string
+
+	// Virtual means this feature is not represented directly in internal/cpu,
+	// but is instead the logical AND of the features in Implies.
+	Virtual bool
+}
+
+// goarchFeatureInfo maps from GOARCH to that architecture's feature-check
+// variable and per-feature information. Not all features need to be in it.
+var goarchFeatureInfo = make(map[string]goarchFeatures)
+
+func registerFeatureInfo(goArch string, features goarchFeatures) {
+	goarchFeatureInfo[goArch] = features // replaces any earlier entry for goArch
+}
+
+// featureImplies returns every feature transitively implied by base on goarch
+// as a comment-ready English list ("A", "A and B", or "A, B, and C"), or ""
+// if there are none. Each feature appears once, however many paths reach it.
+func featureImplies(goarch string, base string) string {
+	var list []string
+	var visit func(f string)
+	visit = func(f string) {
+		for _, dep := range goarchFeatureInfo[goarch].features[f].Implies {
+			// Skip repeats so shared or cyclic dependencies are listed once.
+			if dep != base && !slices.Contains(list, dep) {
+				list = append(list, dep)
+				visit(dep)
+			}
+		}
+	}
+	visit(base)
+	slices.Reverse(list)
+	switch len(list) {
+	case 0:
+		return ""
+	case 1:
+		return list[0]
+	case 2:
+		return list[0] + " and " + list[1]
+	}
+	list[len(list)-1] = "and " + list[len(list)-1]
+	return strings.Join(list, ", ")
+}
+
 func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
 	// Gather all features
 	type featureKey struct {
@@ -606,13 +674,36 @@ func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
 			featureSet[featureKey{op.GoArch, feature}] = struct{}{}
 		}
 	}
-	features := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
+	featureKeys := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
 		if c := cmp.Compare(a.GoArch, b.GoArch); c != 0 {
 			return c
 		}
 		return compareNatural(a.Feature, b.Feature)
 	})
 
+	// TODO: internal/cpu doesn't enforce these at all. You can even do
+	// GODEBUG=cpu.avx=off and it will happily turn off AVX without turning off
+	// AVX2. We need to push these dependencies into it somehow.
+	type feature struct {
+		featureKey
+		FeatureVar string
+		Virtual    bool
+		Implies    []string
+		ImpliesAll string
+	}
+	var features []feature
+	for _, k := range featureKeys {
+		featureVar := goarchFeatureInfo[k.GoArch].featureVar
+		fi := goarchFeatureInfo[k.GoArch].features[k.Feature]
+		features = append(features, feature{
+			featureKey: k,
+			FeatureVar: featureVar,
+			Virtual:    fi.Virtual,
+			Implies:    fi.Implies,
+			ImpliesAll: featureImplies(k.GoArch, k.Feature),
+		})
+	}
+
 	// If we ever have the same feature name on more than one GOARCH, we'll have
 	// to be more careful about this.
 	t := templateOf(simdFeaturesTemplate, "features")
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdssa.go b/src/simd/archsimd/_gen/simdgen/gen_simdssa.go
index 876ffabe3d..96d096688f 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdssa.go
@@ -133,7 +133,7 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer {
 		if mem == NoMem && op.hasMaskedMerging(maskType, shapeOut) {
 			regShapeMerging := regShape
 			if shapeOut != OneVregOutAtIn {
-				// We have to copy the slice here becasue the sort will be visible from other
+				// We have to copy the slice here because the sort will be visible from other
 				// aliases when no reslicing is happening.
 				newIn := make([]Operand, len(op.In), len(op.In)+1)
 				copy(newIn, op.In)
diff --git a/src/simd/archsimd/_gen/simdgen/xed.go b/src/simd/archsimd/_gen/simdgen/xed.go
index 4ba6738e7e..5d6fac64d0 100644
--- a/src/simd/archsimd/_gen/simdgen/xed.go
+++ b/src/simd/archsimd/_gen/simdgen/xed.go
@@ -5,7 +5,6 @@
 package main
 
 import (
-	"cmp"
 	"fmt"
 	"log"
 	"maps"
@@ -78,7 +77,7 @@ func loadXED(xedPath string) []*unify.Value {
 		switch {
 		case inst.RealOpcode == "N":
 			return // Skip unstable instructions
-		case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA")):
+		case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA") || inst.Extension == "FMA"):
-			// We're only interested in AVX and SHA instructions.
+			// We're only interested in AVX, SHA, and FMA instructions.
 			return
 		}
@@ -210,16 +209,9 @@ func loadXED(xedPath string) []*unify.Value {
 			}
 			log.Printf("%d unhandled CPU features for %d instructions (use -v for details)", len(unknownFeatures), nInst)
 		} else {
-			keys := slices.SortedFunc(maps.Keys(unknownFeatures), func(a, b cpuFeatureKey) int {
-				return cmp.Or(cmp.Compare(a.Extension, b.Extension),
-					cmp.Compare(a.ISASet, b.ISASet))
-			})
+			keys := slices.Sorted(maps.Keys(unknownFeatures))
 			for _, key := range keys {
-				if key.ISASet == "" || key.ISASet == key.Extension {
-					log.Printf("unhandled Extension %s", key.Extension)
-				} else {
-					log.Printf("unhandled Extension %s and ISASet %s", key.Extension, key.ISASet)
-				}
+				log.Printf("unhandled ISASet %s", key)
 				log.Printf("  opcodes: %s", slices.Sorted(maps.Keys(unknownFeatures[key])))
 			}
 		}
@@ -763,16 +755,24 @@ func instToUVal1(inst *xeddata.Inst, ops []operand, feature string, variant inst
 // decodeCPUFeature returns the CPU feature name required by inst. These match
 // the names of the "Has*" feature checks in the simd package.
 func decodeCPUFeature(inst *xeddata.Inst) (string, bool) {
-	key := cpuFeatureKey{
-		Extension: inst.Extension,
-		ISASet:    isaSetStrip.ReplaceAllLiteralString(inst.ISASet, ""),
+	isaSet := inst.ISASet
+	if isaSet == "" {
+		// Older instructions don't have an ISA set. Use their "extension"
+		// instead.
+		isaSet = inst.Extension
 	}
-	feat, ok := cpuFeatureMap[key]
+	// We require AVX512VL to use AVX512 at all, so strip off the vector length
+	// suffixes.
+	if strings.HasPrefix(isaSet, "AVX512") {
+		isaSet = isaSetVL.ReplaceAllLiteralString(isaSet, "")
+	}
+
+	feat, ok := cpuFeatureMap[isaSet]
 	if !ok {
-		imap := unknownFeatures[key]
+		imap := unknownFeatures[isaSet]
 		if imap == nil {
 			imap = make(map[string]struct{})
-			unknownFeatures[key] = imap
+			unknownFeatures[isaSet] = imap
 		}
 		imap[inst.Opcode()] = struct{}{}
 		return "", false
@@ -783,45 +783,76 @@ func decodeCPUFeature(inst *xeddata.Inst) (string, bool) {
 	return feat, true
 }
 
-var isaSetStrip = regexp.MustCompile("_(128N?|256N?|512)$")
+var isaSetVL = regexp.MustCompile("_(128N?|256N?|512)$")
 
-type cpuFeatureKey struct {
-	Extension, ISASet string
-}
-
-// cpuFeatureMap maps from XED's "EXTENSION" and "ISA_SET" to a CPU feature name
-// that can be used in the SIMD API.
-var cpuFeatureMap = map[cpuFeatureKey]string{
-	{"SHA", "SHA"}: "SHA",
-
-	{"AVX", ""}:              "AVX",
-	{"AVX_VNNI", "AVX_VNNI"}: "AVXVNNI",
-	{"AVX2", ""}:             "AVX2",
-	{"AVXAES", ""}:           "AVX, AES",
+// cpuFeatureMap maps from XED's "ISA_SET" (or "EXTENSION") to a CPU feature
+// name to expose in the SIMD feature check API.
+//
+// See XED's datafiles/*/cpuid.xed.txt for how ISA set names map to CPUID flags.
+var cpuFeatureMap = map[string]string{
+	"AVX":      "AVX",
+	"AVX_VNNI": "AVXVNNI",
+	"AVX2":     "AVX2",
+	"AVXAES":   "AVXAES",
+	"SHA":      "SHA",
+	"FMA":      "FMA",
 
 	// AVX-512 foundational features. We combine all of these into one "AVX512" feature.
-	{"AVX512EVEX", "AVX512F"}:  "AVX512",
-	{"AVX512EVEX", "AVX512CD"}: "AVX512",
-	{"AVX512EVEX", "AVX512BW"}: "AVX512",
-	{"AVX512EVEX", "AVX512DQ"}: "AVX512",
-	// AVX512VL doesn't appear explicitly in the ISASet. I guess it's implied by
-	// the vector length suffix.
+	"AVX512F":  "AVX512",
+	"AVX512BW": "AVX512",
+	"AVX512CD": "AVX512",
+	"AVX512DQ": "AVX512",
+	// AVX512VL doesn't appear as its own ISASet; instead, the CPUID flag is
+	// required by the *_128 and *_256 ISASets. We fold it into "AVX512" anyway.
 
 	// AVX-512 extension features
-	{"AVX512EVEX", "AVX512_BITALG"}:     "AVX512BITALG",
-	{"AVX512EVEX", "AVX512_GFNI"}:       "AVX512GFNI",
-	{"AVX512EVEX", "AVX512_VBMI2"}:      "AVX512VBMI2",
-	{"AVX512EVEX", "AVX512_VBMI"}:       "AVX512VBMI",
-	{"AVX512EVEX", "AVX512_VNNI"}:       "AVX512VNNI",
-	{"AVX512EVEX", "AVX512_VPOPCNTDQ"}:  "AVX512VPOPCNTDQ",
-	{"AVX512EVEX", "AVX512_VAES"}:       "AVX512VAES",
-	{"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
+	"AVX512_BITALG":     "AVX512BITALG",
+	"AVX512_GFNI":       "AVX512GFNI",
+	"AVX512_VBMI":       "AVX512VBMI",
+	"AVX512_VBMI2":      "AVX512VBMI2",
+	"AVX512_VNNI":       "AVX512VNNI",
+	"AVX512_VPOPCNTDQ":  "AVX512VPOPCNTDQ",
+	"AVX512_VAES":       "AVX512VAES",
+	"AVX512_VPCLMULQDQ": "AVX512VPCLMULQDQ",
 
 	// AVX 10.2 (not yet supported)
-	{"AVX512EVEX", "AVX10_2_RC"}: "ignore",
+	"AVX10_2_RC": "ignore",
+}
+
+// init registers amd64's CPU feature dependency information (see featureInfo).
+func init() {
+	// TODO: In general, Intel makes no guarantees about what flags are set, so
+	// our feature checks need to ensure these themselves, just to be sure.
+	var features = map[string]featureInfo{
+		"AVX2":   {Implies: []string{"AVX"}},
+		"AVX512": {Implies: []string{"AVX2"}},
+
+		"AVXAES": {Virtual: true, Implies: []string{"AVX", "AES"}},
+		"FMA":    {Implies: []string{"AVX"}},
+
+		// AVX-512 subfeatures. NOTE(review): cpuFeatureMap also maps
+		// AVX512VPCLMULQDQ, which has no entry here; confirm that is intended.
+		"AVX512BITALG":    {Implies: []string{"AVX512"}},
+		"AVX512GFNI":      {Implies: []string{"AVX512"}},
+		"AVX512VBMI":      {Implies: []string{"AVX512"}},
+		"AVX512VBMI2":     {Implies: []string{"AVX512"}},
+		"AVX512VNNI":      {Implies: []string{"AVX512"}},
+		"AVX512VPOPCNTDQ": {Implies: []string{"AVX512"}},
+		"AVX512VAES":      {Implies: []string{"AVX512"}},
+
+		// AVX-VNNI and AVX-IFMA "backport" the AVX512-VNNI/IFMA instructions to
+		// VEX encoding, limited to 256 bit vectors, for lower end CPUs that
+		// lack AVX-512. As such, they're built on AVX2's VEX encoding.
+		"AVXVNNI": {Implies: []string{"AVX2"}},
+		"AVXIFMA": {Implies: []string{"AVX2"}},
+	}
+	registerFeatureInfo("amd64", goarchFeatures{
+		featureVar: "X86",
+		features:   features,
+	})
+}
 
-var unknownFeatures = map[cpuFeatureKey]map[string]struct{}{}
+var unknownFeatures = map[string]map[string]struct{}{}
 
 // hasOptionalMask returns whether there is an optional mask operand in ops.
 func hasOptionalMask(ops []operand) bool {
diff --git a/src/simd/archsimd/cpu.go b/src/simd/archsimd/cpu.go
index d0c0ff5426..8069ee7f26 100644
--- a/src/simd/archsimd/cpu.go
+++ b/src/simd/archsimd/cpu.go
@@ -10,14 +10,6 @@ type X86Features struct{}
 
 var X86 X86Features
 
-// AES returns whether the CPU supports the AES feature.
-//
-// AES is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AES() bool {
-	return cpu.X86.HasAES
-}
-
 // AVX returns whether the CPU supports the AVX feature.
 //
 // AVX is defined on all GOARCHes, but will only return true on
@@ -28,6 +20,8 @@ func (X86Features) AVX() bool {
 
 // AVX2 returns whether the CPU supports the AVX2 feature.
 //
+// If it returns true, then the CPU also supports AVX.
+//
 // AVX2 is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX2() bool {
@@ -41,6 +35,8 @@ func (X86Features) AVX2() bool {
 // Nearly every CPU that has shipped with any support for AVX-512 has
 // supported all five of these features.
 //
+// If it returns true, then the CPU also supports AVX and AVX2.
+//
 // AVX512 is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512() bool {
@@ -49,6 +45,8 @@ func (X86Features) AVX512() bool {
 
 // AVX512BITALG returns whether the CPU supports the AVX512BITALG feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512BITALG is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512BITALG() bool {
@@ -57,6 +55,8 @@ func (X86Features) AVX512BITALG() bool {
 
 // AVX512GFNI returns whether the CPU supports the AVX512GFNI feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512GFNI is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512GFNI() bool {
@@ -65,6 +65,8 @@ func (X86Features) AVX512GFNI() bool {
 
 // AVX512VAES returns whether the CPU supports the AVX512VAES feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512VAES is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512VAES() bool {
@@ -73,6 +75,8 @@ func (X86Features) AVX512VAES() bool {
 
 // AVX512VBMI returns whether the CPU supports the AVX512VBMI feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512VBMI is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512VBMI() bool {
@@ -81,6 +85,8 @@ func (X86Features) AVX512VBMI() bool {
 
 // AVX512VBMI2 returns whether the CPU supports the AVX512VBMI2 feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512VBMI2 is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512VBMI2() bool {
@@ -89,6 +95,8 @@ func (X86Features) AVX512VBMI2() bool {
 
 // AVX512VNNI returns whether the CPU supports the AVX512VNNI feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512VNNI is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512VNNI() bool {
@@ -105,20 +113,44 @@ func (X86Features) AVX512VPCLMULQDQ() bool {
 
 // AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
 //
+// If it returns true, then the CPU also supports AVX, AVX2, and AVX512.
+//
 // AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVX512VPOPCNTDQ() bool {
 	return cpu.X86.HasAVX512VPOPCNTDQ
 }
 
+// AVXAES returns whether the CPU supports the AVXAES feature.
+//
+// If it returns true, then the CPU also supports AES and AVX.
+//
+// AVXAES is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVXAES() bool {
+	return cpu.X86.HasAVX && cpu.X86.HasAES
+}
+
 // AVXVNNI returns whether the CPU supports the AVXVNNI feature.
 //
+// If it returns true, then the CPU also supports AVX and AVX2.
+//
 // AVXVNNI is defined on all GOARCHes, but will only return true on
 // GOARCH amd64.
 func (X86Features) AVXVNNI() bool {
 	return cpu.X86.HasAVXVNNI
 }
 
+// FMA returns whether the CPU supports the FMA feature.
+//
+// If it returns true, then the CPU also supports AVX.
+//
+// FMA is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) FMA() bool {
+	return cpu.X86.HasFMA
+}
+
 // SHA returns whether the CPU supports the SHA feature.
 //
 // SHA is defined on all GOARCHes, but will only return true on
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index bb162c4ff9..ec50cc72c5 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -11,7 +11,7 @@ package archsimd
 // y is the chunk of dw array in use.
 // result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
 //
-// Asm: VAESDECLAST, CPU Feature: AVX, AES
+// Asm: VAESDECLAST, CPU Feature: AVXAES
 func (x Uint8x16) AESDecryptLastRound(y Uint32x4) Uint8x16
 
 // AESDecryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -37,7 +37,7 @@ func (x Uint8x64) AESDecryptLastRound(y Uint32x16) Uint8x64
 // y is the chunk of dw array in use.
 // result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
 //
-// Asm: VAESDEC, CPU Feature: AVX, AES
+// Asm: VAESDEC, CPU Feature: AVXAES
 func (x Uint8x16) AESDecryptOneRound(y Uint32x4) Uint8x16
 
 // AESDecryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -63,7 +63,7 @@ func (x Uint8x64) AESDecryptOneRound(y Uint32x16) Uint8x64
 // y is the chunk of w array in use.
 // result = AddRoundKey((ShiftRows(SubBytes(x))), y)
 //
-// Asm: VAESENCLAST, CPU Feature: AVX, AES
+// Asm: VAESENCLAST, CPU Feature: AVXAES
 func (x Uint8x16) AESEncryptLastRound(y Uint32x4) Uint8x16
 
 // AESEncryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -89,7 +89,7 @@ func (x Uint8x64) AESEncryptLastRound(y Uint32x16) Uint8x64
 // y is the chunk of w array in use.
 // result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
 //
-// Asm: VAESENC, CPU Feature: AVX, AES
+// Asm: VAESENC, CPU Feature: AVXAES
 func (x Uint8x16) AESEncryptOneRound(y Uint32x4) Uint8x16
 
 // AESEncryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
@@ -114,7 +114,7 @@ func (x Uint8x64) AESEncryptOneRound(y Uint32x16) Uint8x64
 // x is the chunk of w array in use.
 // result = InvMixColumns(x)
 //
-// Asm: VAESIMC, CPU Feature: AVX, AES
+// Asm: VAESIMC, CPU Feature: AVXAES
 func (x Uint32x4) AESInvMixColumns() Uint32x4
 
 /* AESRoundKeyGenAssist */
@@ -129,7 +129,7 @@ func (x Uint32x4) AESInvMixColumns() Uint32x4
 //
 // rconVal results in better performance when it's a constant, a non-constant value will be translated into a jump table.
 //
-// Asm: VAESKEYGENASSIST, CPU Feature: AVX, AES
+// Asm: VAESKEYGENASSIST, CPU Feature: AVXAES
 func (x Uint32x4) AESRoundKeyGenAssist(rconVal uint8) Uint32x4
 
 /* Abs */
@@ -4088,12 +4088,12 @@ func (x Uint64x8) Mul(y Uint64x8) Uint64x8
 
 // MulAdd performs a fused (x * y) + z.
 //
-// Asm: VFMADD213PS, CPU Feature: AVX512
+// Asm: VFMADD213PS, CPU Feature: FMA
 func (x Float32x4) MulAdd(y Float32x4, z Float32x4) Float32x4
 
 // MulAdd performs a fused (x * y) + z.
 //
-// Asm: VFMADD213PS, CPU Feature: AVX512
+// Asm: VFMADD213PS, CPU Feature: FMA
 func (x Float32x8) MulAdd(y Float32x8, z Float32x8) Float32x8
 
 // MulAdd performs a fused (x * y) + z.
@@ -4103,12 +4103,12 @@ func (x Float32x16) MulAdd(y Float32x16, z Float32x16) Float32x16
 
 // MulAdd performs a fused (x * y) + z.
 //
-// Asm: VFMADD213PD, CPU Feature: AVX512
+// Asm: VFMADD213PD, CPU Feature: FMA
 func (x Float64x2) MulAdd(y Float64x2, z Float64x2) Float64x2
 
 // MulAdd performs a fused (x * y) + z.
 //
-// Asm: VFMADD213PD, CPU Feature: AVX512
+// Asm: VFMADD213PD, CPU Feature: FMA
 func (x Float64x4) MulAdd(y Float64x4, z Float64x4) Float64x4
 
 // MulAdd performs a fused (x * y) + z.
@@ -4120,12 +4120,12 @@ func (x Float64x8) MulAdd(y Float64x8, z Float64x8) Float64x8
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
 //
-// Asm: VFMADDSUB213PS, CPU Feature: AVX512
+// Asm: VFMADDSUB213PS, CPU Feature: FMA
 func (x Float32x4) MulAddSub(y Float32x4, z Float32x4) Float32x4
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
 //
-// Asm: VFMADDSUB213PS, CPU Feature: AVX512
+// Asm: VFMADDSUB213PS, CPU Feature: FMA
 func (x Float32x8) MulAddSub(y Float32x8, z Float32x8) Float32x8
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
@@ -4135,12 +4135,12 @@ func (x Float32x16) MulAddSub(y Float32x16, z Float32x16) Float32x16
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
 //
-// Asm: VFMADDSUB213PD, CPU Feature: AVX512
+// Asm: VFMADDSUB213PD, CPU Feature: FMA
 func (x Float64x2) MulAddSub(y Float64x2, z Float64x2) Float64x2
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
 //
-// Asm: VFMADDSUB213PD, CPU Feature: AVX512
+// Asm: VFMADDSUB213PD, CPU Feature: FMA
 func (x Float64x4) MulAddSub(y Float64x4, z Float64x4) Float64x4
 
 // MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
@@ -4210,12 +4210,12 @@ func (x Uint16x32) MulHigh(y Uint16x32) Uint16x32
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
 //
-// Asm: VFMSUBADD213PS, CPU Feature: AVX512
+// Asm: VFMSUBADD213PS, CPU Feature: FMA
 func (x Float32x4) MulSubAdd(y Float32x4, z Float32x4) Float32x4
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
 //
-// Asm: VFMSUBADD213PS, CPU Feature: AVX512
+// Asm: VFMSUBADD213PS, CPU Feature: FMA
 func (x Float32x8) MulSubAdd(y Float32x8, z Float32x8) Float32x8
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
@@ -4225,12 +4225,12 @@ func (x Float32x16) MulSubAdd(y Float32x16, z Float32x16) Float32x16
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
 //
-// Asm: VFMSUBADD213PD, CPU Feature: AVX512
+// Asm: VFMSUBADD213PD, CPU Feature: FMA
 func (x Float64x2) MulSubAdd(y Float64x2, z Float64x2) Float64x2
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
 //
-// Asm: VFMSUBADD213PD, CPU Feature: AVX512
+// Asm: VFMSUBADD213PD, CPU Feature: FMA
 func (x Float64x4) MulSubAdd(y Float64x4, z Float64x4) Float64x4
 
 // MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.