aboutsummaryrefslogtreecommitdiff
path: root/src/simd
diff options
context:
space:
mode:
authorJunyang Shao <shaojunyang@google.com>2025-10-16 16:07:32 +0000
committerJunyang Shao <shaojunyang@google.com>2025-10-24 10:53:28 -0700
commitcf7c1a4cbb917b6c5d80d1d9443a40cb7720db75 (patch)
treea1e74fb930fae0c22ad03c377c5bc4f4585ad6e0 /src/simd
parent2b8eded4f4fd3d421d1fb9af68c774142abcf208 (diff)
downloadgo-cf7c1a4cbb917b6c5d80d1d9443a40cb7720db75.tar.xz
[dev.simd] cmd/compile, simd: add SHA features
This CL also fixed some bugs left in CL 712181. Change-Id: I9cb6cd9fbaef307f352809bf21b8fec3eb62721a Reviewed-on: https://go-review.googlesource.com/c/go/+/712361 Reviewed-by: David Chase <drchase@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/simd')
-rw-r--r--src/simd/_gen/simdgen/gen_simdIntrinsics.go2
-rw-r--r--src/simd/_gen/simdgen/gen_simdMachineOps.go4
-rw-r--r--src/simd/_gen/simdgen/gen_simdTypes.go9
-rw-r--r--src/simd/_gen/simdgen/gen_simdssa.go3
-rw-r--r--src/simd/_gen/simdgen/gen_utility.go13
-rw-r--r--src/simd/_gen/simdgen/godefs.go2
-rw-r--r--src/simd/_gen/simdgen/main.go21
-rw-r--r--src/simd/_gen/simdgen/ops/Others/categories.yaml61
-rw-r--r--src/simd/_gen/simdgen/ops/Others/go.yaml43
-rw-r--r--src/simd/_gen/simdgen/xed.go3
-rw-r--r--src/simd/cpu.go8
-rw-r--r--src/simd/ops_amd64.go150
12 files changed, 300 insertions, 19 deletions
diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
index a59bd9d658..8827ce07c1 100644
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@@ -58,6 +58,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
{{end}}
{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
{{end}}
+{{define "op2Imm8_SHA1RNDS4"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
{{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
{{end}}
{{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go
index e65b36e95d..b1286ad604 100644
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@@ -16,7 +16,7 @@ const simdMachineOpsTmpl = `
package main
func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
- wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
+ wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
return []opData{
{{- range .OpsData }}
{name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
@@ -61,7 +61,7 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
"wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
- "w3kwload": true, "w2kkload": true}
+ "w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
opsData := make([]opData, 0)
opsDataImm := make([]opData, 0)
opsDataLoad := make([]opData, 0)
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go
index 2d81231cda..a8998ec252 100644
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -352,6 +352,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin
func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
{{end}}
+{{define "op2Imm8_SHA1RNDS4"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
{{define "op3Imm8"}}
{{if .Documentation}}{{.Documentation}}
//{{end}}
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go
index c1ce584549..8402376210 100644
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@@ -96,6 +96,9 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer {
"v2kvloadImm8",
"v31ResultInArg0Imm8",
"v31loadResultInArg0Imm8",
+ "v21ResultInArg0",
+ "v21ResultInArg0Imm8",
+ "v31x0AtIn2ResultInArg0",
}
regInfoSet := map[string][]string{}
for _, key := range regInfoKeys {
diff --git a/src/simd/_gen/simdgen/gen_utility.go b/src/simd/_gen/simdgen/gen_utility.go
index 70f07cf7a4..2fb05026c0 100644
--- a/src/simd/_gen/simdgen/gen_utility.go
+++ b/src/simd/_gen/simdgen/gen_utility.go
@@ -236,9 +236,9 @@ func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskS
// regShape returns a string representation of the register shape.
func (op *Operation) regShape(mem memShape) (string, error) {
_, _, _, _, gOp := op.shape()
- var regInfo string
+ var regInfo, fixedName string
var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
- for _, in := range gOp.In {
+ for i, in := range gOp.In {
switch in.Class {
case "vreg":
vRegInCnt++
@@ -253,8 +253,11 @@ func (op *Operation) regShape(mem memShape) (string, error) {
memInCnt++
vRegInCnt++
}
+ if in.FixedReg != nil {
+ fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
+ }
}
- for _, out := range gOp.Out {
+ for i, out := range gOp.Out {
// If class overwrite is happening, that's not really a mask but a vreg.
if out.Class == "vreg" || out.OverwriteClass != nil {
vRegOutCnt++
@@ -269,6 +272,9 @@ func (op *Operation) regShape(mem memShape) (string, error) {
vRegOutCnt++
memOutCnt++
}
+ if out.FixedReg != nil {
+ fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
+ }
}
var inRegs, inMasks, outRegs, outMasks string
@@ -309,6 +315,7 @@ func (op *Operation) regShape(mem memShape) (string, error) {
if memOutCnt > 0 {
panic("simdgen does not understand memory as output as of now")
}
+ regInfo += fixedName
return regInfo, nil
}
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go
index bda1dfc8fe..244f67fe9d 100644
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -256,6 +256,8 @@ type Operand struct {
// because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
// elemBits 16, which should be 8.
OverwriteElementBits *int
+ // FixedReg is the name of the fixed register for this operand; nil means the operand is not bound to one.
+ FixedReg *string
}
// isDigit returns true if the byte is an ASCII digit.
diff --git a/src/simd/_gen/simdgen/main.go b/src/simd/_gen/simdgen/main.go
index 537dde0c66..ca75cff55d 100644
--- a/src/simd/_gen/simdgen/main.go
+++ b/src/simd/_gen/simdgen/main.go
@@ -92,8 +92,9 @@ import (
"slices"
"strings"
- "gopkg.in/yaml.v3"
"simd/_gen/unify"
+
+ "gopkg.in/yaml.v3"
)
var (
@@ -199,6 +200,15 @@ func main() {
log.Fatal(err)
}
+ // Validate results.
+ //
+ // Don't validate if this is a command-line query because that tends to
+ // eliminate lots of required defs and is used in cases where maybe defs
+ // aren't enumerable anyway.
+ if *flagQ == "" && len(must) > 0 {
+ validate(unified, must)
+ }
+
// Print results.
switch *flagO {
case "yaml":
@@ -228,15 +238,6 @@ func main() {
fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
}
}
-
- // Validate results.
- //
- // Don't validate if this is a command-line query because that tends to
- // eliminate lots of required defs and is used in cases where maybe defs
- // aren't enumerable anyway.
- if *flagQ == "" && len(must) > 0 {
- validate(unified, must)
- }
}
func validate(cl unify.Closure, required map[*unify.Value]struct{}) {
diff --git a/src/simd/_gen/simdgen/ops/Others/categories.yaml b/src/simd/_gen/simdgen/ops/Others/categories.yaml
index dd922fb14b..3c8befb826 100644
--- a/src/simd/_gen/simdgen/ops/Others/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/categories.yaml
@@ -46,4 +46,63 @@
documentation: !string |-
// NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
// x is the chunk of w array in use.
- // result = InvMixColumns(x) \ No newline at end of file
+ // result = InvMixColumns(x)
+- go: SHA1Round4
+ commutative: false
+ documentation: !string |-
+ // NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+ // x contains the state variables a, b, c and d from upper to lower order.
+ // y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+ // result = the state variables a', b', c', d' updated after 4 rounds.
+ // constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+- go: SHA1NextE
+ commutative: false
+ documentation: !string |-
+ // NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+ // x contains the state variable a (before the 4 rounds), placed in the upper element.
+ // y is the elements of W array for next 4 rounds from upper to lower order.
+ // result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+ // from upper to lower order.
+ // For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+ // for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+ // computation of the value of e'.)
+- go: SHA1Msg1
+ commutative: false
+ documentation: !string |-
+ // NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+ // x = {W3, W2, W1, W0}
+ // y = {0, 0, W5, W4}
+ // result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+- go: SHA1Msg2
+ commutative: false
+ documentation: !string |-
+ // NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+ // x = result of 2.
+ // y = {W15, W14, W13}
+ // result = {W19, W18, W17, W16}
+- go: SHA256Rounds2
+ commutative: false
+ documentation: !string |-
+ // NAME does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+ // x = {h, g, d, c}
+ // y = {f, e, b, a}
+ // z = {W0+K0, W1+K1}
+ // result = {f', e', b', a'}
+ // The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+ // the corresponding element of the W array to make the input data z.
+ // The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+ // y (the state variables a, b, e, f before the 2 rounds).
+- go: SHA256Msg1
+ commutative: false
+ documentation: !string |-
+ // NAME does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+ // x = {W0, W1, W2, W3}
+ // y = {W4, 0, 0, 0}
+ // result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+- go: SHA256Msg2
+ commutative: false
+ documentation: !string |-
+ // NAME does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+ // x = result of 2
+ // y = {0, 0, W14, W15}
+ // result = {W16, W17, W18, W19} \ No newline at end of file
diff --git a/src/simd/_gen/simdgen/ops/Others/go.yaml b/src/simd/_gen/simdgen/ops/Others/go.yaml
index 0f8b7b43a2..77b9fc3783 100644
--- a/src/simd/_gen/simdgen/ops/Others/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/go.yaml
@@ -52,4 +52,45 @@
in:
- *uint32s
out:
- - *uint32s \ No newline at end of file
+ - *uint32s
+- go: SHA1Round4
+ asm: SHA1RNDS4
+ operandOrder: "SHA1RNDS4"
+ in: &2any1imm
+ - *any
+ - *any
+ - class: immediate
+ immOffset: 0
+ out: &1any
+ - *any
+- go: SHA1NextE
+ asm: SHA1NEXTE
+ in: &2any
+ - *any
+ - *any
+ out: *1any
+- go: SHA1Msg1
+ asm: SHA1MSG1
+ in: *2any
+ out: *1any
+- go: SHA1Msg2
+ asm: SHA1MSG2
+ in: *2any
+ out: *1any
+- go: SHA256Rounds2
+ asm: SHA256RNDS2
+ in:
+ - base: $t
+ - base: $t
+ - base: $t
+ overwriteElementBits: 32
+ out:
+ - base: $t
+- go: SHA256Msg1
+ asm: SHA256MSG1
+ in: *2any
+ out: *1any
+- go: SHA256Msg2
+ asm: SHA256MSG2
+ in: *2any
+ out: *1any \ No newline at end of file
diff --git a/src/simd/_gen/simdgen/xed.go b/src/simd/_gen/simdgen/xed.go
index 76bd584b52..9e9b67e77d 100644
--- a/src/simd/_gen/simdgen/xed.go
+++ b/src/simd/_gen/simdgen/xed.go
@@ -25,7 +25,6 @@ const (
NOT_REG_CLASS = iota // not a register
VREG_CLASS // classify as a vector register; see
GREG_CLASS // classify as a general register
- REG_FIXED // classify as a fixed register
)
// instVariant is a bitmap indicating a variant of an instruction that has
@@ -852,7 +851,7 @@ type fixedReg struct {
}
var fixedRegMap = map[string]fixedReg{
- "XED_REG_XMM0": {REG_FIXED, "XMM0", 128},
+ "XED_REG_XMM0": {VREG_CLASS, "x0", 128},
}
// decodeReg returns class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,
diff --git a/src/simd/cpu.go b/src/simd/cpu.go
index 7d4fe25003..ca445072c0 100644
--- a/src/simd/cpu.go
+++ b/src/simd/cpu.go
@@ -106,3 +106,11 @@ func HasAVX512VPOPCNTDQ() bool {
func HasAVXVNNI() bool {
return cpu.X86.HasAVXVNNI
}
+
+// HasSHA returns whether the CPU supports the SHA feature.
+//
+// HasSHA is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func HasSHA() bool {
+ return cpu.X86.HasSHA
+}
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go
index 49c387aea9..e0c76099ba 100644
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5623,6 +5623,156 @@ func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
// Asm: VREDUCEPD, CPU Feature: AVX512
func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8
+/* SHA1Msg1 */
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Int32x4) SHA1Msg1(y Int32x4) Int32x4
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg1(y Uint32x4) Uint32x4
+
+/* SHA1Msg2 */
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Int32x4) SHA1Msg2(y Int32x4) Int32x4
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg2(y Uint32x4) Uint32x4
+
+/* SHA1NextE */
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Int32x4) SHA1NextE(y Int32x4) Int32x4
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
+
+/* SHA1Round4 */
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Int32x4) SHA1Round4(constant uint8, y Int32x4) Int32x4
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Uint32x4) SHA1Round4(constant uint8, y Uint32x4) Uint32x4
+
+/* SHA256Msg1 */
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg1(y Int32x4) Int32x4
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg1(y Uint32x4) Uint32x4
+
+/* SHA256Msg2 */
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg2(y Int32x4) Int32x4
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg2(y Uint32x4) Uint32x4
+
+/* SHA256Rounds2 */
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Int32x4) SHA256Rounds2(y Int32x4, z Int32x4) Int32x4
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Uint32x4) SHA256Rounds2(y Uint32x4, z Uint32x4) Uint32x4
+
/* Scale */
// Scale multiplies elements by a power of 2.