[dev.simd] cmd/compile, simd: add SHA features

This CL also fixed some bugs left in CL 712181. Change-Id: I9cb6cd9fbaef307f352809bf21b8fec3eb62721a Reviewed-on: https://go-review.googlesource.com/c/go/+/712361 Reviewed-by: David Chase <drchase@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
author: Junyang Shao <shaojunyang@google.com> 2025-10-16 16:07:32 +0000
committer: Junyang Shao <shaojunyang@google.com> 2025-10-24 10:53:28 -0700
commit: cf7c1a4cbb917b6c5d80d1d9443a40cb7720db75 (patch)
tree: a1e74fb930fae0c22ad03c377c5bc4f4585ad6e0 /src/simd
parent: 2b8eded4f4fd3d421d1fb9af68c774142abcf208 (diff)
download: go-cf7c1a4cbb917b6c5d80d1d9443a40cb7720db75.tar.xz
12 files changed, 300 insertions, 19 deletions
diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
index a59bd9d658..8827ce07c1 100644
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@@ -58,6 +58,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 {{end}}
 {{define "op2Imm8_II"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
 {{end}}
+{{define "op2Imm8_SHA1RNDS4"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
 {{define "op3Imm8"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
 {{end}}
 {{define "op3Imm8_2I"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go
index e65b36e95d..b1286ad604 100644
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@@ -16,7 +16,7 @@ const simdMachineOpsTmpl = `
 package main
 
 func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
-	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
+	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
 	return []opData{
 {{- range .OpsData }}
 		{name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
@@ -61,7 +61,7 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
 		"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
 		"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
 		"wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
-		"w3kwload": true, "w2kkload": true}
+		"w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
 	opsData := make([]opData, 0)
 	opsDataImm := make([]opData, 0)
 	opsDataLoad := make([]opData, 0)
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go
index 2d81231cda..a8998ec252 100644
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -352,6 +352,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin
 func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
 {{end}}
 
+{{define "op2Imm8_SHA1RNDS4"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
 {{define "op3Imm8"}}
 {{if .Documentation}}{{.Documentation}}
 //{{end}}
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go
index c1ce584549..8402376210 100644
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@@ -96,6 +96,9 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer {
 		"v2kvloadImm8",
 		"v31ResultInArg0Imm8",
 		"v31loadResultInArg0Imm8",
+		"v21ResultInArg0",
+		"v21ResultInArg0Imm8",
+		"v31x0AtIn2ResultInArg0",
 	}
 	regInfoSet := map[string][]string{}
 	for _, key := range regInfoKeys {
diff --git a/src/simd/_gen/simdgen/gen_utility.go b/src/simd/_gen/simdgen/gen_utility.go
index 70f07cf7a4..2fb05026c0 100644
--- a/src/simd/_gen/simdgen/gen_utility.go
+++ b/src/simd/_gen/simdgen/gen_utility.go
@@ -236,9 +236,9 @@ func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskS
 // regShape returns a string representation of the register shape.
 func (op *Operation) regShape(mem memShape) (string, error) {
 	_, _, _, _, gOp := op.shape()
-	var regInfo string
+	var regInfo, fixedName string
 	var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
-	for _, in := range gOp.In {
+	for i, in := range gOp.In {
 		switch in.Class {
 		case "vreg":
 			vRegInCnt++
@@ -253,8 +253,11 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 			memInCnt++
 			vRegInCnt++
 		}
+		if in.FixedReg != nil {
+			fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
+		}
 	}
-	for _, out := range gOp.Out {
+	for i, out := range gOp.Out {
 		// If class overwrite is happening, that's not really a mask but a vreg.
 		if out.Class == "vreg" || out.OverwriteClass != nil {
 			vRegOutCnt++
@@ -269,6 +272,9 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 			vRegOutCnt++
 			memOutCnt++
 		}
+		if out.FixedReg != nil {
+			fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
+		}
 	}
 	var inRegs, inMasks, outRegs, outMasks string
 
@@ -309,6 +315,7 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 	if memOutCnt > 0 {
 		panic("simdgen does not understand memory as output as of now")
 	}
+	regInfo += fixedName
 	return regInfo, nil
 }
 
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go
index bda1dfc8fe..244f67fe9d 100644
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -256,6 +256,8 @@ type Operand struct {
 	// because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
 	// elemBits 16, which should be 8.
 	OverwriteElementBits *int
+	// FixedReg is the name of the fixed register for this operand; nil if the operand has no fixed register.
+	FixedReg *string
 }
 
 // isDigit returns true if the byte is an ASCII digit.
diff --git a/src/simd/_gen/simdgen/main.go b/src/simd/_gen/simdgen/main.go
index 537dde0c66..ca75cff55d 100644
--- a/src/simd/_gen/simdgen/main.go
+++ b/src/simd/_gen/simdgen/main.go
@@ -92,8 +92,9 @@ import (
 	"slices"
 	"strings"
 
-	"gopkg.in/yaml.v3"
 	"simd/_gen/unify"
+
+	"gopkg.in/yaml.v3"
 )
 
 var (
@@ -199,6 +200,15 @@ func main() {
 		log.Fatal(err)
 	}
 
+	// Validate results.
+	//
+	// Don't validate if this is a command-line query because that tends to
+	// eliminate lots of required defs and is used in cases where maybe defs
+	// aren't enumerable anyway.
+	if *flagQ == "" && len(must) > 0 {
+		validate(unified, must)
+	}
+
 	// Print results.
 	switch *flagO {
 	case "yaml":
@@ -228,15 +238,6 @@ func main() {
 			fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
 		}
 	}
-
-	// Validate results.
-	//
-	// Don't validate if this is a command-line query because that tends to
-	// eliminate lots of required defs and is used in cases where maybe defs
-	// aren't enumerable anyway.
-	if *flagQ == "" && len(must) > 0 {
-		validate(unified, must)
-	}
 }
 
 func validate(cl unify.Closure, required map[*unify.Value]struct{}) {
diff --git a/src/simd/_gen/simdgen/ops/Others/categories.yaml b/src/simd/_gen/simdgen/ops/Others/categories.yaml
index dd922fb14b..3c8befb826 100644
--- a/src/simd/_gen/simdgen/ops/Others/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/categories.yaml
@@ -46,4 +46,63 @@
   documentation: !string |-
     // NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
     // x is the chunk of w array in use.
-    // result = InvMixColumns(x)
-\ No newline at end of file
+    // result = InvMixColumns(x)
+- go: SHA1Round4
+  commutative: false
+  documentation: !string |-
+    // NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+    // x contains the state variables a, b, c and d from upper to lower order.
+    // y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+    // result = the state variables a', b', c', d' updated after 4 rounds.
+    // constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+- go: SHA1NextE
+  commutative: false
+  documentation: !string |-
+    // NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+    // x contains the state variable a (before the 4 rounds), placed in the upper element.
+    // y is the elements of W array for next 4 rounds from upper to lower order.
+    // result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+    // from upper to lower order.
+    // For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
+    // for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+    // computation of the value of e'.)
+- go: SHA1Msg1
+  commutative: false
+  documentation: !string |-
+    // NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+    // x = {W3, W2, W1, W0}
+    // y = {0, 0, W5, W4}
+    // result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+- go: SHA1Msg2
+  commutative: false
+  documentation: !string |-
+    // NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+    // x = result of 2.
+    // y = {W15, W14, W13}
+    // result = {W19, W18, W17, W16}
+- go: SHA256Rounds2
+  commutative: false
+  documentation: !string |-
+    // NAME does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+    // x = {h, g, d, c}
+    // y = {f, e, b, a}
+    // z = {W0+K0, W1+K1}
+    // result = {f', e', b', a'}
+    // The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+    // the corresponding element of the W array to make the input data z.
+    // The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+    // y (the state variables a, b, e, f before the 2 rounds).
+- go: SHA256Msg1
+  commutative: false
+  documentation: !string |-
+    // NAME does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+    // x = {W0, W1, W2, W3}
+    // y = {W4, 0, 0, 0}
+    // result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+- go: SHA256Msg2
+  commutative: false
+  documentation: !string |-
+    // NAME does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+    // x = result of 2
+    // y = {0, 0, W14, W15}
+    // result = {W16, W17, W18, W19}
+\ No newline at end of file
diff --git a/src/simd/_gen/simdgen/ops/Others/go.yaml b/src/simd/_gen/simdgen/ops/Others/go.yaml
index 0f8b7b43a2..77b9fc3783 100644
--- a/src/simd/_gen/simdgen/ops/Others/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/go.yaml
@@ -52,4 +52,45 @@
   in:
   - *uint32s
   out:
-  - *uint32s
-\ No newline at end of file
+  - *uint32s
+- go: SHA1Round4
+  asm: SHA1RNDS4
+  operandOrder: "SHA1RNDS4"
+  in: &2any1imm
+  - *any
+  - *any
+  - class: immediate
+    immOffset: 0
+  out: &1any
+  - *any
+- go: SHA1NextE
+  asm: SHA1NEXTE
+  in: &2any
+  - *any
+  - *any
+  out: *1any
+- go: SHA1Msg1
+  asm: SHA1MSG1
+  in: *2any
+  out: *1any
+- go: SHA1Msg2
+  asm: SHA1MSG2
+  in: *2any
+  out: *1any
+- go: SHA256Rounds2
+  asm: SHA256RNDS2
+  in:
+  - base: $t
+  - base: $t
+  - base: $t
+    overwriteElementBits: 32
+  out:
+  - base: $t
+- go: SHA256Msg1
+  asm: SHA256MSG1
+  in: *2any
+  out: *1any
+- go: SHA256Msg2
+  asm: SHA256MSG1
+  in: *2any
+  out: *1any
+\ No newline at end of file
diff --git a/src/simd/_gen/simdgen/xed.go b/src/simd/_gen/simdgen/xed.go
index 76bd584b52..9e9b67e77d 100644
--- a/src/simd/_gen/simdgen/xed.go
+++ b/src/simd/_gen/simdgen/xed.go
@@ -25,7 +25,6 @@ const (
 	NOT_REG_CLASS = iota // not a register
 	VREG_CLASS           // classify as a vector register; see
 	GREG_CLASS           // classify as a general register
-	REG_FIXED            // classify as a fixed  register
 )
 
 // instVariant is a bitmap indicating a variant of an instruction that has
@@ -852,7 +851,7 @@ type fixedReg struct {
 }
 
 var fixedRegMap = map[string]fixedReg{
-	"XED_REG_XMM0": {REG_FIXED, "XMM0", 128},
+	"XED_REG_XMM0": {VREG_CLASS, "x0", 128},
 }
 
 // decodeReg returns class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,
diff --git a/src/simd/cpu.go b/src/simd/cpu.go
index 7d4fe25003..ca445072c0 100644
--- a/src/simd/cpu.go
+++ b/src/simd/cpu.go
@@ -106,3 +106,11 @@ func HasAVX512VPOPCNTDQ() bool {
 func HasAVXVNNI() bool {
 	return cpu.X86.HasAVXVNNI
 }
+
+// HasSHA returns whether the CPU supports the SHA feature.
+//
+// HasSHA is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func HasSHA() bool {
+	return cpu.X86.HasSHA
+}
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go
index 49c387aea9..e0c76099ba 100644
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5623,6 +5623,156 @@ func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
 // Asm: VREDUCEPD, CPU Feature: AVX512
 func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8
 
+/* SHA1Msg1 */
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Int32x4) SHA1Msg1(y Int32x4) Int32x4
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg1(y Uint32x4) Uint32x4
+
+/* SHA1Msg2 */
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Int32x4) SHA1Msg2(y Int32x4) Int32x4
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg2(y Uint32x4) Uint32x4
+
+/* SHA1NextE */
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Int32x4) SHA1NextE(y Int32x4) Int32x4
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
+
+/* SHA1Round4 */
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Int32x4) SHA1Round4(constant uint8, y Int32x4) Int32x4
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Uint32x4) SHA1Round4(constant uint8, y Uint32x4) Uint32x4
+
+/* SHA256Msg1 */
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg1(y Int32x4) Int32x4
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg1(y Uint32x4) Uint32x4
+
+/* SHA256Msg2 */
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg2(y Int32x4) Int32x4
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg2(y Uint32x4) Uint32x4
+
+/* SHA256Rounds2 */
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Int32x4) SHA256Rounds2(y Int32x4, z Int32x4) Int32x4
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Uint32x4) SHA256Rounds2(y Uint32x4, z Uint32x4) Uint32x4
+
 /* Scale */
 
 // Scale multiplies elements by a power of 2.
author	Junyang Shao <shaojunyang@google.com>	2025-10-16 16:07:32 +0000
committer	Junyang Shao <shaojunyang@google.com>	2025-10-24 10:53:28 -0700
commit	cf7c1a4cbb917b6c5d80d1d9443a40cb7720db75 (patch)
tree	a1e74fb930fae0c22ad03c377c5bc4f4585ad6e0 /src/simd
parent	2b8eded4f4fd3d421d1fb9af68c774142abcf208 (diff)
download	go-cf7c1a4cbb917b6c5d80d1d9443a40cb7720db75.tar.xz