author     Dmitri Shuralyov <dmitshur@golang.org>    2024-11-21 19:54:12 -0500
committer  Gopher Robot <gobot@golang.org>           2024-11-22 01:35:05 +0000
commit     ca14eaf77c86bd5492329d2be6f1a82afe7802f5
tree       1b6daab038ea1094e63584a460aebbec56491b30
parent     e8d95619978c4602d4446f113b3b69b7a22308fa
all: update golang.org/x/net [generated]
Part of keeping Go's vendored dependencies and generated code up to date.
This updates h2_bundle.go with unencrypted HTTP/2 support.

For #36905.
For #67816.

[git-generate]
cd src
go get golang.org/x/net@v0.31.0
go mod tidy
go mod vendor
cd cmd
go get golang.org/x/net@v0.31.0
go mod tidy
go mod vendor
go generate -run=bundle std

Change-Id: I2b77f651b990f260fbe7d551c7a819518f1c983f
Reviewed-on: https://go-review.googlesource.com/c/go/+/631035
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Damien Neil <dneil@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
Diffstat (limited to 'src')
-rw-r--r--  src/cmd/go.mod | 4
-rw-r--r--  src/cmd/go.sum | 8
-rw-r--r--  src/cmd/vendor/golang.org/x/term/README.md | 11
-rw-r--r--  src/cmd/vendor/golang.org/x/term/term_windows.go | 1
-rw-r--r--  src/cmd/vendor/modules.txt | 4
-rw-r--r--  src/go.mod | 6
-rw-r--r--  src/go.sum | 12
-rw-r--r--  src/net/http/h2_bundle.go | 849
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go | 2
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go (renamed from src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go) | 2
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s (renamed from src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s) | 114
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s | 11467
-rw-r--r--  src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go | 2
-rw-r--r--  src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s | 133
-rw-r--r--  src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go (renamed from src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go) | 2
-rw-r--r--  src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s (renamed from src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s) | 30
-rw-r--r--  src/vendor/golang.org/x/net/route/address.go | 27
-rw-r--r--  src/vendor/modules.txt | 6
18 files changed, 10105 insertions(+), 2575 deletions(-)
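The headline change is unencrypted HTTP/2 ("h2c") support in h2_bundle.go, visible below in the new unencrypted_http2 TLSNextProto key, the unencryptedTransport type, and the RoundTripOpt scheme check. As a point of reference for what a cleartext HTTP/2 client looks like, here is a minimal sketch using the long-standing golang.org/x/net/http2 knobs (AllowHTTP plus a plain-TCP dialer), not the new handoff path added in this CL; the target URL is hypothetical and assumes an h2c-capable server:

package main

import (
	"context"
	"crypto/tls"
	"fmt"
	"net"
	"net/http"

	"golang.org/x/net/http2"
)

func main() {
	tr := &http2.Transport{
		AllowHTTP: true, // permit http:// URLs over HTTP/2
		// Dial plain TCP; the *tls.Config argument is ignored for cleartext.
		DialTLSContext: func(ctx context.Context, network, addr string, _ *tls.Config) (net.Conn, error) {
			var d net.Dialer
			return d.DialContext(ctx, network, addr)
		},
	}
	client := &http.Client{Transport: tr}
	resp, err := client.Get("http://127.0.0.1:8080/") // hypothetical h2c server
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	fmt.Println(resp.Proto) // prints "HTTP/2.0" on success
}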
diff --git a/src/cmd/go.mod b/src/cmd/go.mod
index 8488e0a47a..b441fe55a9 100644
--- a/src/cmd/go.mod
+++ b/src/cmd/go.mod
@@ -10,12 +10,12 @@ require (
golang.org/x/sync v0.9.0
golang.org/x/sys v0.27.0
golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97
- golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292
+ golang.org/x/term v0.26.0
golang.org/x/tools v0.27.0
)
require (
github.com/ianlancetaylor/demangle v0.0.0-20240912202439-0a2b6291aafd // indirect
- golang.org/x/text v0.19.0 // indirect
+ golang.org/x/text v0.20.0 // indirect
rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef // indirect
)
diff --git a/src/cmd/go.sum b/src/cmd/go.sum
index 7ad2cb7f94..685fd76a0f 100644
--- a/src/cmd/go.sum
+++ b/src/cmd/go.sum
@@ -18,10 +18,10 @@ golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97 h1:5xPN7d0u5VdgF2gFFXUDaeD3NP1pPgFMHocnCQGAh5M=
golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97/go.mod h1:m7R/r+o5h7UvF2JD9n2iLSGY4v8v+zNSyTJ6xynLrqs=
-golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292 h1:BOrQi08eIX3cDgGcMgFONf27MxXigcYa9x+iW5JuCXw=
-golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
-golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
-golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=
+golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o=
golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q=
rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef h1:mqLYrXCXYEZOop9/Dbo6RPX11539nwiCNBb1icVPmw8=
diff --git a/src/cmd/vendor/golang.org/x/term/README.md b/src/cmd/vendor/golang.org/x/term/README.md
index d03d0aefef..05ff623f94 100644
--- a/src/cmd/vendor/golang.org/x/term/README.md
+++ b/src/cmd/vendor/golang.org/x/term/README.md
@@ -4,16 +4,13 @@
This repository provides Go terminal and console support packages.
-## Download/Install
-
-The easiest way to install is to run `go get -u golang.org/x/term`. You can
-also manually git clone the repository to `$GOPATH/src/golang.org/x/term`.
-
## Report Issues / Send Patches
This repository uses Gerrit for code changes. To learn how to submit changes to
-this repository, see https://golang.org/doc/contribute.html.
+this repository, see https://go.dev/doc/contribute.
+
+The git repository is https://go.googlesource.com/term.
The main issue tracker for the term repository is located at
-https://github.com/golang/go/issues. Prefix your issue with "x/term:" in the
+https://go.dev/issues. Prefix your issue with "x/term:" in the
subject line, so it is easy to find.
diff --git a/src/cmd/vendor/golang.org/x/term/term_windows.go b/src/cmd/vendor/golang.org/x/term/term_windows.go
index 465f560604..df6bf948e1 100644
--- a/src/cmd/vendor/golang.org/x/term/term_windows.go
+++ b/src/cmd/vendor/golang.org/x/term/term_windows.go
@@ -26,6 +26,7 @@ func makeRaw(fd int) (*State, error) {
return nil, err
}
raw := st &^ (windows.ENABLE_ECHO_INPUT | windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT)
+ raw |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT
if err := windows.SetConsoleMode(windows.Handle(fd), raw); err != nil {
return nil, err
}
diff --git a/src/cmd/vendor/modules.txt b/src/cmd/vendor/modules.txt
index cc2ce3df7e..6ea68b2153 100644
--- a/src/cmd/vendor/modules.txt
+++ b/src/cmd/vendor/modules.txt
@@ -60,10 +60,10 @@ golang.org/x/telemetry/internal/crashmonitor
golang.org/x/telemetry/internal/mmap
golang.org/x/telemetry/internal/telemetry
golang.org/x/telemetry/internal/upload
-# golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292
+# golang.org/x/term v0.26.0
## explicit; go 1.18
golang.org/x/term
-# golang.org/x/text v0.19.0
+# golang.org/x/text v0.20.0
## explicit; go 1.18
golang.org/x/text/cases
golang.org/x/text/internal
diff --git a/src/go.mod b/src/go.mod
index 7a7b99150f..47ef05be87 100644
--- a/src/go.mod
+++ b/src/go.mod
@@ -3,11 +3,11 @@ module std
go 1.24
require (
- golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0
- golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd
+ golang.org/x/crypto v0.29.0
+ golang.org/x/net v0.31.0
)
require (
golang.org/x/sys v0.27.0 // indirect
- golang.org/x/text v0.19.0 // indirect
+ golang.org/x/text v0.20.0 // indirect
)
diff --git a/src/go.sum b/src/go.sum
index 26a27ac809..9d72b7d654 100644
--- a/src/go.sum
+++ b/src/go.sum
@@ -1,8 +1,8 @@
-golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0 h1:wxHbFWyu21uEPJJnYaSDaHSWbvnZ9gLSSOPwnEc3lLM=
-golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
-golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd h1:pHzwejE8Zkb94bG4nA+fUeskKPFp1HPldrhv62dabro=
-golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
+golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
+golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
+golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
+golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
-golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
diff --git a/src/net/http/h2_bundle.go b/src/net/http/h2_bundle.go
index af86c1430d..6b40923a86 100644
--- a/src/net/http/h2_bundle.go
+++ b/src/net/http/h2_bundle.go
@@ -883,7 +883,7 @@ func (c *http2dialCall) dial(ctx context.Context, addr string) {
// This code decides which ones live or die.
// The return value used is whether c was used.
// c is never closed.
-func (p *http2clientConnPool) addConnIfNeeded(key string, t *http2Transport, c *tls.Conn) (used bool, err error) {
+func (p *http2clientConnPool) addConnIfNeeded(key string, t *http2Transport, c net.Conn) (used bool, err error) {
p.mu.Lock()
for _, cc := range p.conns[key] {
if cc.CanTakeNewRequest() {
@@ -919,8 +919,8 @@ type http2addConnCall struct {
err error
}
-func (c *http2addConnCall) run(t *http2Transport, key string, tc *tls.Conn) {
- cc, err := t.NewClientConn(tc)
+func (c *http2addConnCall) run(t *http2Transport, key string, nc net.Conn) {
+ cc, err := t.NewClientConn(nc)
p := c.p
p.mu.Lock()
@@ -1035,6 +1035,169 @@ func http2shouldRetryDial(call *http2dialCall, req *Request) bool {
return call.ctx.Err() != nil
}
+// http2Config is a package-internal version of net/http.HTTP2Config.
+//
+// http.HTTP2Config was added in Go 1.24.
+// When running with a version of net/http that includes HTTP2Config,
+// we merge the configuration with the fields in Transport or Server
+// to produce an http2Config.
+//
+// Zero valued fields in http2Config are interpreted as in the
+// net/http.HTTP2Config documentation.
+//
+// Precedence order for reconciling configurations is:
+//
+// - Use the net/http.{Server,Transport}.HTTP2Config value, when non-zero.
+// - Otherwise use the http2.{Server,Transport} value.
+// - If the resulting value is zero or out of range, use a default.
+type http2http2Config struct {
+ MaxConcurrentStreams uint32
+ MaxDecoderHeaderTableSize uint32
+ MaxEncoderHeaderTableSize uint32
+ MaxReadFrameSize uint32
+ MaxUploadBufferPerConnection int32
+ MaxUploadBufferPerStream int32
+ SendPingTimeout time.Duration
+ PingTimeout time.Duration
+ WriteByteTimeout time.Duration
+ PermitProhibitedCipherSuites bool
+ CountError func(errType string)
+}
+
+// configFromServer merges configuration settings from
+// net/http.Server.HTTP2Config and http2.Server.
+func http2configFromServer(h1 *Server, h2 *http2Server) http2http2Config {
+ conf := http2http2Config{
+ MaxConcurrentStreams: h2.MaxConcurrentStreams,
+ MaxEncoderHeaderTableSize: h2.MaxEncoderHeaderTableSize,
+ MaxDecoderHeaderTableSize: h2.MaxDecoderHeaderTableSize,
+ MaxReadFrameSize: h2.MaxReadFrameSize,
+ MaxUploadBufferPerConnection: h2.MaxUploadBufferPerConnection,
+ MaxUploadBufferPerStream: h2.MaxUploadBufferPerStream,
+ SendPingTimeout: h2.ReadIdleTimeout,
+ PingTimeout: h2.PingTimeout,
+ WriteByteTimeout: h2.WriteByteTimeout,
+ PermitProhibitedCipherSuites: h2.PermitProhibitedCipherSuites,
+ CountError: h2.CountError,
+ }
+ http2fillNetHTTPServerConfig(&conf, h1)
+ http2setConfigDefaults(&conf, true)
+ return conf
+}
+
+// configFromTransport merges configuration settings from h2 and h2.t1.HTTP2
+// (the net/http Transport).
+func http2configFromTransport(h2 *http2Transport) http2http2Config {
+ conf := http2http2Config{
+ MaxEncoderHeaderTableSize: h2.MaxEncoderHeaderTableSize,
+ MaxDecoderHeaderTableSize: h2.MaxDecoderHeaderTableSize,
+ MaxReadFrameSize: h2.MaxReadFrameSize,
+ SendPingTimeout: h2.ReadIdleTimeout,
+ PingTimeout: h2.PingTimeout,
+ WriteByteTimeout: h2.WriteByteTimeout,
+ }
+
+ // Unlike most config fields, where out-of-range values revert to the default,
+ // Transport.MaxReadFrameSize clips.
+ if conf.MaxReadFrameSize < http2minMaxFrameSize {
+ conf.MaxReadFrameSize = http2minMaxFrameSize
+ } else if conf.MaxReadFrameSize > http2maxFrameSize {
+ conf.MaxReadFrameSize = http2maxFrameSize
+ }
+
+ if h2.t1 != nil {
+ http2fillNetHTTPTransportConfig(&conf, h2.t1)
+ }
+ http2setConfigDefaults(&conf, false)
+ return conf
+}
+
+func http2setDefault[T ~int | ~int32 | ~uint32 | ~int64](v *T, minval, maxval, defval T) {
+ if *v < minval || *v > maxval {
+ *v = defval
+ }
+}
+
+func http2setConfigDefaults(conf *http2http2Config, server bool) {
+ http2setDefault(&conf.MaxConcurrentStreams, 1, math.MaxUint32, http2defaultMaxStreams)
+ http2setDefault(&conf.MaxEncoderHeaderTableSize, 1, math.MaxUint32, http2initialHeaderTableSize)
+ http2setDefault(&conf.MaxDecoderHeaderTableSize, 1, math.MaxUint32, http2initialHeaderTableSize)
+ if server {
+ http2setDefault(&conf.MaxUploadBufferPerConnection, http2initialWindowSize, math.MaxInt32, 1<<20)
+ } else {
+ http2setDefault(&conf.MaxUploadBufferPerConnection, http2initialWindowSize, math.MaxInt32, http2transportDefaultConnFlow)
+ }
+ if server {
+ http2setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, 1<<20)
+ } else {
+ http2setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, http2transportDefaultStreamFlow)
+ }
+ http2setDefault(&conf.MaxReadFrameSize, http2minMaxFrameSize, http2maxFrameSize, http2defaultMaxReadFrameSize)
+ http2setDefault(&conf.PingTimeout, 1, math.MaxInt64, 15*time.Second)
+}
+
+// adjustHTTP1MaxHeaderSize converts a limit in bytes on the size of an HTTP/1 header
+// to an HTTP/2 MAX_HEADER_LIST_SIZE value.
+func http2adjustHTTP1MaxHeaderSize(n int64) int64 {
+ // http2's count is in a slightly different unit and includes 32 bytes per pair.
+ // So, take the net/http.Server value and pad it up a bit, assuming 10 headers.
+ const perFieldOverhead = 32 // per http2 spec
+ const typicalHeaders = 10 // conservative
+ return n + typicalHeaders*perFieldOverhead
+}
+
+// fillNetHTTPServerConfig sets fields in conf from srv.HTTP2.
+func http2fillNetHTTPServerConfig(conf *http2http2Config, srv *Server) {
+ http2fillNetHTTPConfig(conf, srv.HTTP2)
+}
+
+// fillNetHTTPTransportConfig sets fields in conf from tr.HTTP2.
+func http2fillNetHTTPTransportConfig(conf *http2http2Config, tr *Transport) {
+ http2fillNetHTTPConfig(conf, tr.HTTP2)
+}
+
+func http2fillNetHTTPConfig(conf *http2http2Config, h2 *HTTP2Config) {
+ if h2 == nil {
+ return
+ }
+ if h2.MaxConcurrentStreams != 0 {
+ conf.MaxConcurrentStreams = uint32(h2.MaxConcurrentStreams)
+ }
+ if h2.MaxEncoderHeaderTableSize != 0 {
+ conf.MaxEncoderHeaderTableSize = uint32(h2.MaxEncoderHeaderTableSize)
+ }
+ if h2.MaxDecoderHeaderTableSize != 0 {
+ conf.MaxDecoderHeaderTableSize = uint32(h2.MaxDecoderHeaderTableSize)
+ }
+ if h2.MaxConcurrentStreams != 0 {
+ conf.MaxConcurrentStreams = uint32(h2.MaxConcurrentStreams)
+ }
+ if h2.MaxReadFrameSize != 0 {
+ conf.MaxReadFrameSize = uint32(h2.MaxReadFrameSize)
+ }
+ if h2.MaxReceiveBufferPerConnection != 0 {
+ conf.MaxUploadBufferPerConnection = int32(h2.MaxReceiveBufferPerConnection)
+ }
+ if h2.MaxReceiveBufferPerStream != 0 {
+ conf.MaxUploadBufferPerStream = int32(h2.MaxReceiveBufferPerStream)
+ }
+ if h2.SendPingTimeout != 0 {
+ conf.SendPingTimeout = h2.SendPingTimeout
+ }
+ if h2.PingTimeout != 0 {
+ conf.PingTimeout = h2.PingTimeout
+ }
+ if h2.WriteByteTimeout != 0 {
+ conf.WriteByteTimeout = h2.WriteByteTimeout
+ }
+ if h2.PermitProhibitedCipherSuites {
+ conf.PermitProhibitedCipherSuites = true
+ }
+ if h2.CountError != nil {
+ conf.CountError = h2.CountError
+ }
+}
+
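To make the precedence rules above concrete: a non-zero field on net/http's HTTP2Config wins over the corresponding http2.Server or http2.Transport field, and anything still zero or out of range is filled in by setConfigDefaults. A sketch against the Go 1.24 surface referenced above (field names taken from fillNetHTTPConfig; values are illustrative and the cert paths hypothetical):

package main

import (
	"log"
	"net/http"
	"time"

	"golang.org/x/net/http2"
)

func main() {
	h2 := &http2.Server{MaxConcurrentStreams: 250} // library-level setting
	srv := &http.Server{
		Addr: ":8443",
		HTTP2: &http.HTTP2Config{
			MaxConcurrentStreams: 500,              // wins over h2's 250
			PingTimeout:          10 * time.Second, // wins over the 15s default
		},
	}
	// The merged config would use MaxConcurrentStreams=500, PingTimeout=10s.
	if err := http2.ConfigureServer(srv, h2); err != nil {
		log.Fatal(err)
	}
	log.Fatal(srv.ListenAndServeTLS("cert.pem", "key.pem"))
}

Note also adjustHTTP1MaxHeaderSize above: an HTTP/1 limit of 1 MiB maps to 1048576 + 10*32 = 1048896 bytes of MAX_HEADER_LIST_SIZE, padding for the 32 bytes of per-field overhead that the HTTP/2 spec charges.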
// Buffer chunks are allocated from a pool to reduce pressure on GC.
// The maximum wasted space per dataBuffer is 2x the largest size class,
// which happens when the dataBuffer has multiple chunks and there is
@@ -3550,13 +3713,19 @@ func (cw http2closeWaiter) Wait() {
// Its buffered writer is lazily allocated as needed, to minimize
// idle memory usage with many connections.
type http2bufferedWriter struct {
- _ http2incomparable
- w io.Writer // immutable
- bw *bufio.Writer // non-nil when data is buffered
+ _ http2incomparable
+ group http2synctestGroupInterface // immutable
+ conn net.Conn // immutable
+ bw *bufio.Writer // non-nil when data is buffered
+ byteTimeout time.Duration // immutable, WriteByteTimeout
}
-func http2newBufferedWriter(w io.Writer) *http2bufferedWriter {
- return &http2bufferedWriter{w: w}
+func http2newBufferedWriter(group http2synctestGroupInterface, conn net.Conn, timeout time.Duration) *http2bufferedWriter {
+ return &http2bufferedWriter{
+ group: group,
+ conn: conn,
+ byteTimeout: timeout,
+ }
}
// bufWriterPoolBufferSize is the size of bufio.Writer's
@@ -3583,7 +3752,7 @@ func (w *http2bufferedWriter) Available() int {
func (w *http2bufferedWriter) Write(p []byte) (n int, err error) {
if w.bw == nil {
bw := http2bufWriterPool.Get().(*bufio.Writer)
- bw.Reset(w.w)
+ bw.Reset((*http2bufferedWriterTimeoutWriter)(w))
w.bw = bw
}
return w.bw.Write(p)
@@ -3601,6 +3770,38 @@ func (w *http2bufferedWriter) Flush() error {
return err
}
+type http2bufferedWriterTimeoutWriter http2bufferedWriter
+
+func (w *http2bufferedWriterTimeoutWriter) Write(p []byte) (n int, err error) {
+ return http2writeWithByteTimeout(w.group, w.conn, w.byteTimeout, p)
+}
+
+// writeWithByteTimeout writes to conn.
+// If more than timeout passes without any bytes being written to the connection,
+// the write fails.
+func http2writeWithByteTimeout(group http2synctestGroupInterface, conn net.Conn, timeout time.Duration, p []byte) (n int, err error) {
+ if timeout <= 0 {
+ return conn.Write(p)
+ }
+ for {
+ var now time.Time
+ if group == nil {
+ now = time.Now()
+ } else {
+ now = group.Now()
+ }
+ conn.SetWriteDeadline(now.Add(timeout))
+ nn, err := conn.Write(p[n:])
+ n += nn
+ if n == len(p) || nn == 0 || !errors.Is(err, os.ErrDeadlineExceeded) {
+ // Either we finished the write, made no progress, or hit the deadline.
+ // Whichever it is, we're done now.
+ conn.SetWriteDeadline(time.Time{})
+ return n, err
+ }
+ }
+}
+
func http2mustUint31(v int32) uint32 {
if v < 0 || v > 2147483647 {
panic("out of range")
@@ -3882,10 +4083,14 @@ func (p *http2pipe) Done() <-chan struct{} {
}
const (
- http2prefaceTimeout = 10 * time.Second
- http2firstSettingsTimeout = 2 * time.Second // should be in-flight with preface anyway
- http2handlerChunkWriteSize = 4 << 10
- http2defaultMaxStreams = 250 // TODO: make this 100 as the GFE seems to?
+ http2prefaceTimeout = 10 * time.Second
+ http2firstSettingsTimeout = 2 * time.Second // should be in-flight with preface anyway
+ http2handlerChunkWriteSize = 4 << 10
+ http2defaultMaxStreams = 250 // TODO: make this 100 as the GFE seems to?
+
+ // maxQueuedControlFrames is the maximum number of control frames like
+ // SETTINGS, PING and RST_STREAM that will be queued for writing before
+ // the connection is closed to prevent memory exhaustion attacks.
http2maxQueuedControlFrames = 10000
)
@@ -3957,6 +4162,22 @@ type http2Server struct {
// If zero or negative, there is no timeout.
IdleTimeout time.Duration
+ // ReadIdleTimeout is the timeout after which a health check using a ping
+ // frame will be carried out if no frame is received on the connection.
+ // If zero, no health check is performed.
+ ReadIdleTimeout time.Duration
+
+ // PingTimeout is the timeout after which the connection will be closed
+ // if a response to a ping is not received.
+ // If zero, a default of 15 seconds is used.
+ PingTimeout time.Duration
+
+ // WriteByteTimeout is the timeout after which a connection will be
+ // closed if no data can be written to it. The timeout begins when data is
+ // available to write, and is extended whenever any bytes are written.
+ // If zero or negative, there is no timeout.
+ WriteByteTimeout time.Duration
+
// MaxUploadBufferPerConnection is the size of the initial flow
// control window for each connection. The HTTP/2 spec does not
// allow this to be smaller than 65535 or larger than 2^32-1.
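The three new Server fields above mirror the ReadIdleTimeout/PingTimeout/WriteByteTimeout trio that http2.Transport already exposes, and they feed configFromServer. A minimal sketch of enabling server-side health checks via the external golang.org/x/net/http2 package (values illustrative, cert paths hypothetical):

package main

import (
	"log"
	"net/http"
	"time"

	"golang.org/x/net/http2"
)

func main() {
	srv := &http.Server{Addr: ":8443"}
	h2 := &http2.Server{
		ReadIdleTimeout:  30 * time.Second, // send a health-check PING after 30s without frames
		PingTimeout:      15 * time.Second, // close the conn if the PING ACK never arrives
		WriteByteTimeout: 10 * time.Second, // close the conn if writes stop making progress
	}
	if err := http2.ConfigureServer(srv, h2); err != nil {
		log.Fatal(err)
	}
	log.Fatal(srv.ListenAndServeTLS("cert.pem", "key.pem"))
}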
@@ -4019,57 +4240,6 @@ func (s *http2Server) afterFunc(d time.Duration, f func()) http2timer {
return http2timeTimer{time.AfterFunc(d, f)}
}
-func (s *http2Server) initialConnRecvWindowSize() int32 {
- if s.MaxUploadBufferPerConnection >= http2initialWindowSize {
- return s.MaxUploadBufferPerConnection
- }
- return 1 << 20
-}
-
-func (s *http2Server) initialStreamRecvWindowSize() int32 {
- if s.MaxUploadBufferPerStream > 0 {
- return s.MaxUploadBufferPerStream
- }
- return 1 << 20
-}
-
-func (s *http2Server) maxReadFrameSize() uint32 {
- if v := s.MaxReadFrameSize; v >= http2minMaxFrameSize && v <= http2maxFrameSize {
- return v
- }
- return http2defaultMaxReadFrameSize
-}
-
-func (s *http2Server) maxConcurrentStreams() uint32 {
- if v := s.MaxConcurrentStreams; v > 0 {
- return v
- }
- return http2defaultMaxStreams
-}
-
-func (s *http2Server) maxDecoderHeaderTableSize() uint32 {
- if v := s.MaxDecoderHeaderTableSize; v > 0 {
- return v
- }
- return http2initialHeaderTableSize
-}
-
-func (s *http2Server) maxEncoderHeaderTableSize() uint32 {
- if v := s.MaxEncoderHeaderTableSize; v > 0 {
- return v
- }
- return http2initialHeaderTableSize
-}
-
-// maxQueuedControlFrames is the maximum number of control frames like
-// SETTINGS, PING and RST_STREAM that will be queued for writing before
-// the connection is closed to prevent memory exhaustion attacks.
-func (s *http2Server) maxQueuedControlFrames() int {
- // TODO: if anybody asks, add a Server field, and remember to define the
- // behavior of negative values.
- return http2maxQueuedControlFrames
-}
-
type http2serverInternalState struct {
mu sync.Mutex
activeConns map[*http2serverConn]struct{}
@@ -4166,7 +4336,7 @@ func http2ConfigureServer(s *Server, conf *http2Server) error {
if s.TLSNextProto == nil {
s.TLSNextProto = map[string]func(*Server, *tls.Conn, Handler){}
}
- protoHandler := func(hs *Server, c *tls.Conn, h Handler) {
+ protoHandler := func(hs *Server, c net.Conn, h Handler, sawClientPreface bool) {
if http2testHookOnConn != nil {
http2testHookOnConn()
}
@@ -4183,12 +4353,31 @@ func http2ConfigureServer(s *Server, conf *http2Server) error {
ctx = bc.BaseContext()
}
conf.ServeConn(c, &http2ServeConnOpts{
- Context: ctx,
- Handler: h,
- BaseConfig: hs,
+ Context: ctx,
+ Handler: h,
+ BaseConfig: hs,
+ SawClientPreface: sawClientPreface,
})
}
- s.TLSNextProto[http2NextProtoTLS] = protoHandler
+ s.TLSNextProto[http2NextProtoTLS] = func(hs *Server, c *tls.Conn, h Handler) {
+ protoHandler(hs, c, h, false)
+ }
+ // The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns.
+ //
+ // A connection passed in this method has already had the HTTP/2 preface read from it.
+ s.TLSNextProto[http2nextProtoUnencryptedHTTP2] = func(hs *Server, c *tls.Conn, h Handler) {
+ nc, err := http2unencryptedNetConnFromTLSConn(c)
+ if err != nil {
+ if lg := hs.ErrorLog; lg != nil {
+ lg.Print(err)
+ } else {
+ log.Print(err)
+ }
+ go c.Close()
+ return
+ }
+ protoHandler(hs, nc, h, true)
+ }
return nil
}
@@ -4270,13 +4459,15 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func(
baseCtx, cancel := http2serverConnBaseContext(c, opts)
defer cancel()
+ http1srv := opts.baseConfig()
+ conf := http2configFromServer(http1srv, s)
sc := &http2serverConn{
srv: s,
- hs: opts.baseConfig(),
+ hs: http1srv,
conn: c,
baseCtx: baseCtx,
remoteAddrStr: c.RemoteAddr().String(),
- bw: http2newBufferedWriter(c),
+ bw: http2newBufferedWriter(s.group, c, conf.WriteByteTimeout),
handler: opts.handler(),
streams: make(map[uint32]*http2stream),
readFrameCh: make(chan http2readFrameResult),
@@ -4286,9 +4477,12 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func(
bodyReadCh: make(chan http2bodyReadMsg), // buffering doesn't matter either way
doneServing: make(chan struct{}),
clientMaxStreams: math.MaxUint32, // Section 6.5.2: "Initially, there is no limit to this value"
- advMaxStreams: s.maxConcurrentStreams(),
+ advMaxStreams: conf.MaxConcurrentStreams,
initialStreamSendWindowSize: http2initialWindowSize,
+ initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream,
maxFrameSize: http2initialMaxFrameSize,
+ pingTimeout: conf.PingTimeout,
+ countErrorFunc: conf.CountError,
serveG: http2newGoroutineLock(),
pushEnabled: true,
sawClientPreface: opts.SawClientPreface,
@@ -4321,15 +4515,15 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func(
sc.flow.add(http2initialWindowSize)
sc.inflow.init(http2initialWindowSize)
sc.hpackEncoder = hpack.NewEncoder(&sc.headerWriteBuf)
- sc.hpackEncoder.SetMaxDynamicTableSizeLimit(s.maxEncoderHeaderTableSize())
+ sc.hpackEncoder.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize)
fr := http2NewFramer(sc.bw, c)
- if s.CountError != nil {
- fr.countError = s.CountError
+ if conf.CountError != nil {
+ fr.countError = conf.CountError
}
- fr.ReadMetaHeaders = hpack.NewDecoder(s.maxDecoderHeaderTableSize(), nil)
+ fr.ReadMetaHeaders = hpack.NewDecoder(conf.MaxDecoderHeaderTableSize, nil)
fr.MaxHeaderListSize = sc.maxHeaderListSize()
- fr.SetMaxReadFrameSize(s.maxReadFrameSize())
+ fr.SetMaxReadFrameSize(conf.MaxReadFrameSize)
sc.framer = fr
if tc, ok := c.(http2connectionStater); ok {
@@ -4362,7 +4556,7 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func(
// So for now, do nothing here again.
}
- if !s.PermitProhibitedCipherSuites && http2isBadCipher(sc.tlsState.CipherSuite) {
+ if !conf.PermitProhibitedCipherSuites && http2isBadCipher(sc.tlsState.CipherSuite) {
// "Endpoints MAY choose to generate a connection error
// (Section 5.4.1) of type INADEQUATE_SECURITY if one of
// the prohibited cipher suites are negotiated."
@@ -4399,7 +4593,7 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func(
opts.UpgradeRequest = nil
}
- sc.serve()
+ sc.serve(conf)
}
func http2serverConnBaseContext(c net.Conn, opts *http2ServeConnOpts) (ctx context.Context, cancel func()) {
@@ -4439,6 +4633,7 @@ type http2serverConn struct {
tlsState *tls.ConnectionState // shared by all handlers, like net/http
remoteAddrStr string
writeSched http2WriteScheduler
+ countErrorFunc func(errType string)
// Everything following is owned by the serve loop; use serveG.check():
serveG http2goroutineLock // used to verify funcs are on serve()
@@ -4458,6 +4653,7 @@ type http2serverConn struct {
streams map[uint32]*http2stream
unstartedHandlers []http2unstartedHandler
initialStreamSendWindowSize int32
+ initialStreamRecvWindowSize int32
maxFrameSize int32
peerMaxHeaderListSize uint32 // zero means unknown (default)
canonHeader map[string]string // http2-lower-case -> Go-Canonical-Case
@@ -4468,9 +4664,14 @@ type http2serverConn struct {
inGoAway bool // we've started to or sent GOAWAY
inFrameScheduleLoop bool // whether we're in the scheduleFrameWrite loop
needToSendGoAway bool // we need to schedule a GOAWAY frame write
+ pingSent bool
+ sentPingData [8]byte
goAwayCode http2ErrCode
shutdownTimer http2timer // nil until used
idleTimer http2timer // nil if unused
+ readIdleTimeout time.Duration
+ pingTimeout time.Duration
+ readIdleTimer http2timer // nil if unused
// Owned by the writeFrameAsync goroutine:
headerWriteBuf bytes.Buffer
@@ -4485,11 +4686,7 @@ func (sc *http2serverConn) maxHeaderListSize() uint32 {
if n <= 0 {
n = DefaultMaxHeaderBytes
}
- // http2's count is in a slightly different unit and includes 32 bytes per pair.
- // So, take the net/http.Server value and pad it up a bit, assuming 10 headers.
- const perFieldOverhead = 32 // per http2 spec
- const typicalHeaders = 10 // conservative
- return uint32(n + typicalHeaders*perFieldOverhead)
+ return uint32(http2adjustHTTP1MaxHeaderSize(int64(n)))
}
func (sc *http2serverConn) curOpenStreams() uint32 {
@@ -4756,7 +4953,7 @@ func (sc *http2serverConn) notePanic() {
}
}
-func (sc *http2serverConn) serve() {
+func (sc *http2serverConn) serve(conf http2http2Config) {
sc.serveG.check()
defer sc.notePanic()
defer sc.conn.Close()
@@ -4770,18 +4967,18 @@ func (sc *http2serverConn) serve() {
sc.writeFrame(http2FrameWriteRequest{
write: http2writeSettings{
- {http2SettingMaxFrameSize, sc.srv.maxReadFrameSize()},
+ {http2SettingMaxFrameSize, conf.MaxReadFrameSize},
{http2SettingMaxConcurrentStreams, sc.advMaxStreams},
{http2SettingMaxHeaderListSize, sc.maxHeaderListSize()},
- {http2SettingHeaderTableSize, sc.srv.maxDecoderHeaderTableSize()},
- {http2SettingInitialWindowSize, uint32(sc.srv.initialStreamRecvWindowSize())},
+ {http2SettingHeaderTableSize, conf.MaxDecoderHeaderTableSize},
+ {http2SettingInitialWindowSize, uint32(sc.initialStreamRecvWindowSize)},
},
})
sc.unackedSettings++
// Each connection starts with initialWindowSize inflow tokens.
// If a higher value is configured, we add more tokens.
- if diff := sc.srv.initialConnRecvWindowSize() - http2initialWindowSize; diff > 0 {
+ if diff := conf.MaxUploadBufferPerConnection - http2initialWindowSize; diff > 0 {
sc.sendWindowUpdate(nil, int(diff))
}
@@ -4801,11 +4998,18 @@ func (sc *http2serverConn) serve() {
defer sc.idleTimer.Stop()
}
+ if conf.SendPingTimeout > 0 {
+ sc.readIdleTimeout = conf.SendPingTimeout
+ sc.readIdleTimer = sc.srv.afterFunc(conf.SendPingTimeout, sc.onReadIdleTimer)
+ defer sc.readIdleTimer.Stop()
+ }
+
go sc.readFrames() // closed by defer sc.conn.Close above
settingsTimer := sc.srv.afterFunc(http2firstSettingsTimeout, sc.onSettingsTimer)
defer settingsTimer.Stop()
+ lastFrameTime := sc.srv.now()
loopNum := 0
for {
loopNum++
@@ -4819,6 +5023,7 @@ func (sc *http2serverConn) serve() {
case res := <-sc.wroteFrameCh:
sc.wroteFrame(res)
case res := <-sc.readFrameCh:
+ lastFrameTime = sc.srv.now()
// Process any written frames before reading new frames from the client since a
// written frame could have triggered a new stream to be started.
if sc.writingFrameAsync {
@@ -4850,6 +5055,8 @@ func (sc *http2serverConn) serve() {
case http2idleTimerMsg:
sc.vlogf("connection is idle")
sc.goAway(http2ErrCodeNo)
+ case http2readIdleTimerMsg:
+ sc.handlePingTimer(lastFrameTime)
case http2shutdownTimerMsg:
sc.vlogf("GOAWAY close timer fired; closing conn from %v", sc.conn.RemoteAddr())
return
@@ -4872,7 +5079,7 @@ func (sc *http2serverConn) serve() {
// If the peer is causing us to generate a lot of control frames,
// but not reading them from us, assume they are trying to make us
// run out of memory.
- if sc.queuedControlFrames > sc.srv.maxQueuedControlFrames() {
+ if sc.queuedControlFrames > http2maxQueuedControlFrames {
sc.vlogf("http2: too many control frames in send queue, closing connection")
return
}
@@ -4888,12 +5095,39 @@ func (sc *http2serverConn) serve() {
}
}
+func (sc *http2serverConn) handlePingTimer(lastFrameReadTime time.Time) {
+ if sc.pingSent {
+ sc.vlogf("timeout waiting for PING response")
+ sc.conn.Close()
+ return
+ }
+
+ pingAt := lastFrameReadTime.Add(sc.readIdleTimeout)
+ now := sc.srv.now()
+ if pingAt.After(now) {
+ // We received frames since arming the ping timer.
+ // Reset it for the next possible timeout.
+ sc.readIdleTimer.Reset(pingAt.Sub(now))
+ return
+ }
+
+ sc.pingSent = true
+ // Ignore crypto/rand.Read errors: It generally can't fail, and worst case if it does
+ // is we send a PING frame containing 0s.
+ _, _ = rand.Read(sc.sentPingData[:])
+ sc.writeFrame(http2FrameWriteRequest{
+ write: &http2writePing{data: sc.sentPingData},
+ })
+ sc.readIdleTimer.Reset(sc.pingTimeout)
+}
+
type http2serverMessage int
// Message values sent to serveMsgCh.
var (
http2settingsTimerMsg = new(http2serverMessage)
http2idleTimerMsg = new(http2serverMessage)
+ http2readIdleTimerMsg = new(http2serverMessage)
http2shutdownTimerMsg = new(http2serverMessage)
http2gracefulShutdownMsg = new(http2serverMessage)
http2handlerDoneMsg = new(http2serverMessage)
@@ -4903,6 +5137,8 @@ func (sc *http2serverConn) onSettingsTimer() { sc.sendServeMsg(http2settingsTime
func (sc *http2serverConn) onIdleTimer() { sc.sendServeMsg(http2idleTimerMsg) }
+func (sc *http2serverConn) onReadIdleTimer() { sc.sendServeMsg(http2readIdleTimerMsg) }
+
func (sc *http2serverConn) onShutdownTimer() { sc.sendServeMsg(http2shutdownTimerMsg) }
func (sc *http2serverConn) sendServeMsg(msg interface{}) {
@@ -5155,6 +5391,10 @@ func (sc *http2serverConn) wroteFrame(res http2frameWriteResult) {
sc.writingFrame = false
sc.writingFrameAsync = false
+ if res.err != nil {
+ sc.conn.Close()
+ }
+
wr := res.wr
if http2writeEndsStream(wr.write) {
@@ -5429,6 +5669,11 @@ func (sc *http2serverConn) processFrame(f http2Frame) error {
func (sc *http2serverConn) processPing(f *http2PingFrame) error {
sc.serveG.check()
if f.IsAck() {
+ if sc.pingSent && sc.sentPingData == f.Data {
+ // This is a response to a PING we sent.
+ sc.pingSent = false
+ sc.readIdleTimer.Reset(sc.readIdleTimeout)
+ }
// 6.7 PING: " An endpoint MUST NOT respond to PING frames
// containing this flag."
return nil
@@ -5995,7 +6240,7 @@ func (sc *http2serverConn) newStream(id, pusherID uint32, state http2streamState
st.cw.Init()
st.flow.conn = &sc.flow // link to conn-level counter
st.flow.add(sc.initialStreamSendWindowSize)
- st.inflow.init(sc.srv.initialStreamRecvWindowSize())
+ st.inflow.init(sc.initialStreamRecvWindowSize)
if sc.hs.WriteTimeout > 0 {
st.writeDeadline = sc.srv.afterFunc(sc.hs.WriteTimeout, st.onWriteTimeout)
}
@@ -6690,6 +6935,11 @@ func (w *http2responseWriter) SetWriteDeadline(deadline time.Time) error {
return nil
}
+func (w *http2responseWriter) EnableFullDuplex() error {
+ // We always support full duplex responses, so this is a no-op.
+ return nil
+}
+
func (w *http2responseWriter) Flush() {
w.FlushError()
}
@@ -7136,7 +7386,7 @@ func (sc *http2serverConn) countError(name string, err error) error {
if sc == nil || sc.srv == nil {
return err
}
- f := sc.srv.CountError
+ f := sc.countErrorFunc
if f == nil {
return err
}
@@ -7339,6 +7589,20 @@ func (t *http2Transport) markNewGoroutine() {
}
}
+func (t *http2Transport) now() time.Time {
+ if t != nil && t.http2transportTestHooks != nil {
+ return t.http2transportTestHooks.group.Now()
+ }
+ return time.Now()
+}
+
+func (t *http2Transport) timeSince(when time.Time) time.Duration {
+ if t != nil && t.http2transportTestHooks != nil {
+ return t.now().Sub(when)
+ }
+ return time.Since(when)
+}
+
// newTimer creates a new time.Timer, or a synthetic timer in tests.
func (t *http2Transport) newTimer(d time.Duration) http2timer {
if t.http2transportTestHooks != nil {
@@ -7363,40 +7627,26 @@ func (t *http2Transport) contextWithTimeout(ctx context.Context, d time.Duration
}
func (t *http2Transport) maxHeaderListSize() uint32 {
- if t.MaxHeaderListSize == 0 {
+ n := int64(t.MaxHeaderListSize)
+ if t.t1 != nil && t.t1.MaxResponseHeaderBytes != 0 {
+ n = t.t1.MaxResponseHeaderBytes
+ if n > 0 {
+ n = http2adjustHTTP1MaxHeaderSize(n)
+ }
+ }
+ if n <= 0 {
return 10 << 20
}
- if t.MaxHeaderListSize == 0xffffffff {
+ if n >= 0xffffffff {
return 0
}
- return t.MaxHeaderListSize
-}
-
-func (t *http2Transport) maxFrameReadSize() uint32 {
- if t.MaxReadFrameSize == 0 {
- return 0 // use the default provided by the peer
- }
- if t.MaxReadFrameSize < http2minMaxFrameSize {
- return http2minMaxFrameSize
- }
- if t.MaxReadFrameSize > http2maxFrameSize {
- return http2maxFrameSize
- }
- return t.MaxReadFrameSize
+ return uint32(n)
}
func (t *http2Transport) disableCompression() bool {
return t.DisableCompression || (t.t1 != nil && t.t1.DisableCompression)
}
-func (t *http2Transport) pingTimeout() time.Duration {
- if t.PingTimeout == 0 {
- return 15 * time.Second
- }
- return t.PingTimeout
-
-}
-
// ConfigureTransport configures a net/http HTTP/1 Transport to use HTTP/2.
// It returns an error if t1 has already been HTTP/2-enabled.
//
@@ -7432,8 +7682,8 @@ func http2configureTransports(t1 *Transport) (*http2Transport, error) {
if !http2strSliceContains(t1.TLSClientConfig.NextProtos, "http/1.1") {
t1.TLSClientConfig.NextProtos = append(t1.TLSClientConfig.NextProtos, "http/1.1")
}
- upgradeFn := func(authority string, c *tls.Conn) RoundTripper {
- addr := http2authorityAddr("https", authority)
+ upgradeFn := func(scheme, authority string, c net.Conn) RoundTripper {
+ addr := http2authorityAddr(scheme, authority)
if used, err := connPool.addConnIfNeeded(addr, t2, c); err != nil {
go c.Close()
return http2erringRoundTripper{err}
@@ -7444,18 +7694,37 @@ func http2configureTransports(t1 *Transport) (*http2Transport, error) {
// was unknown)
go c.Close()
}
+ if scheme == "http" {
+ return (*http2unencryptedTransport)(t2)
+ }
return t2
}
- if m := t1.TLSNextProto; len(m) == 0 {
- t1.TLSNextProto = map[string]func(string, *tls.Conn) RoundTripper{
- "h2": upgradeFn,
+ if t1.TLSNextProto == nil {
+ t1.TLSNextProto = make(map[string]func(string, *tls.Conn) RoundTripper)
+ }
+ t1.TLSNextProto[http2NextProtoTLS] = func(authority string, c *tls.Conn) RoundTripper {
+ return upgradeFn("https", authority, c)
+ }
+ // The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns.
+ t1.TLSNextProto[http2nextProtoUnencryptedHTTP2] = func(authority string, c *tls.Conn) RoundTripper {
+ nc, err := http2unencryptedNetConnFromTLSConn(c)
+ if err != nil {
+ go c.Close()
+ return http2erringRoundTripper{err}
}
- } else {
- m["h2"] = upgradeFn
+ return upgradeFn("http", authority, nc)
}
return t2, nil
}
+// unencryptedTransport is a Transport with a RoundTrip method that
+// always permits http:// URLs.
+type http2unencryptedTransport http2Transport
+
+func (t *http2unencryptedTransport) RoundTrip(req *Request) (*Response, error) {
+ return (*http2Transport)(t).RoundTripOpt(req, http2RoundTripOpt{allowHTTP: true})
+}
+
func (t *http2Transport) connPool() http2ClientConnPool {
t.connPoolOnce.Do(t.initConnPool)
return t.connPoolOrDef
@@ -7475,7 +7744,7 @@ type http2ClientConn struct {
t *http2Transport
tconn net.Conn // usually *tls.Conn, except specialized impls
tlsState *tls.ConnectionState // nil only for specialized impls
- reused uint32 // whether conn is being reused; atomic
+ atomicReused uint32 // whether conn is being reused; atomic
singleUse bool // whether being used for a single http.Request
getConnCalled bool // used by clientConnPool
@@ -7506,11 +7775,22 @@ type http2ClientConn struct {
lastActive time.Time
lastIdle time.Time // time last idle
// Settings from peer: (also guarded by wmu)
- maxFrameSize uint32
- maxConcurrentStreams uint32
- peerMaxHeaderListSize uint64
- peerMaxHeaderTableSize uint32
- initialWindowSize uint32
+ maxFrameSize uint32
+ maxConcurrentStreams uint32
+ peerMaxHeaderListSize uint64
+ peerMaxHeaderTableSize uint32
+ initialWindowSize uint32
+ initialStreamRecvWindowSize int32
+ readIdleTimeout time.Duration
+ pingTimeout time.Duration
+
+ // pendingResets is the number of RST_STREAM frames we have sent to the peer,
+ // without confirming that the peer has received them. When we send a RST_STREAM,
+ // we bundle it with a PING frame, unless a PING is already in flight. We count
+ // the reset stream against the connection's concurrency limit until we get
+ // a PING response. This limits the number of requests we'll try to send to a
+ // completely unresponsive connection.
+ pendingResets int
// reqHeaderMu is a 1-element semaphore channel controlling access to sending new requests.
// Write to reqHeaderMu to lock it, read from it to unlock.
@@ -7568,12 +7848,12 @@ type http2clientStream struct {
sentHeaders bool
// owned by clientConnReadLoop:
- firstByte bool // got the first response byte
- pastHeaders bool // got first MetaHeadersFrame (actual headers)
- pastTrailers bool // got optional second MetaHeadersFrame (trailers)
- num1xx uint8 // number of 1xx responses seen
- readClosed bool // peer sent an END_STREAM flag
- readAborted bool // read loop reset the stream
+ firstByte bool // got the first response byte
+ pastHeaders bool // got first MetaHeadersFrame (actual headers)
+ pastTrailers bool // got optional second MetaHeadersFrame (trailers)
+ readClosed bool // peer sent an END_STREAM flag
+ readAborted bool // read loop reset the stream
+ totalHeaderSize int64 // total size of 1xx headers seen
trailer Header // accumulated trailers
resTrailer *Header // client's Response.Trailer
@@ -7635,6 +7915,7 @@ func (cs *http2clientStream) closeReqBodyLocked() {
}
type http2stickyErrWriter struct {
+ group http2synctestGroupInterface
conn net.Conn
timeout time.Duration
err *error
@@ -7644,22 +7925,9 @@ func (sew http2stickyErrWriter) Write(p []byte) (n int, err error) {
if *sew.err != nil {
return 0, *sew.err
}
- for {
- if sew.timeout != 0 {
- sew.conn.SetWriteDeadline(time.Now().Add(sew.timeout))
- }
- nn, err := sew.conn.Write(p[n:])
- n += nn
- if n < len(p) && nn > 0 && errors.Is(err, os.ErrDeadlineExceeded) {
- // Keep extending the deadline so long as we're making progress.
- continue
- }
- if sew.timeout != 0 {
- sew.conn.SetWriteDeadline(time.Time{})
- }
- *sew.err = err
- return n, err
- }
+ n, err = http2writeWithByteTimeout(sew.group, sew.conn, sew.timeout, p)
+ *sew.err = err
+ return n, err
}
// noCachedConnError is the concrete type of ErrNoCachedConn, which
@@ -7691,6 +7959,8 @@ type http2RoundTripOpt struct {
// no cached connection is available, RoundTripOpt
// will return ErrNoCachedConn.
OnlyCachedConn bool
+
+ allowHTTP bool // allow http:// URLs
}
func (t *http2Transport) RoundTrip(req *Request) (*Response, error) {
@@ -7723,7 +7993,14 @@ func http2authorityAddr(scheme string, authority string) (addr string) {
// RoundTripOpt is like RoundTrip, but takes options.
func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Response, error) {
- if !(req.URL.Scheme == "https" || (req.URL.Scheme == "http" && t.AllowHTTP)) {
+ switch req.URL.Scheme {
+ case "https":
+ // Always okay.
+ case "http":
+ if !t.AllowHTTP && !opt.allowHTTP {
+ return nil, errors.New("http2: unencrypted HTTP/2 not enabled")
+ }
+ default:
return nil, errors.New("http2: unsupported scheme")
}
@@ -7734,7 +8011,7 @@ func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Res
t.vlogf("http2: Transport failed to get client conn for %s: %v", addr, err)
return nil, err
}
- reused := !atomic.CompareAndSwapUint32(&cc.reused, 0, 1)
+ reused := !atomic.CompareAndSwapUint32(&cc.atomicReused, 0, 1)
http2traceGotConn(req, cc, reused)
res, err := cc.RoundTrip(req)
if err != nil && retry <= 6 {
@@ -7759,6 +8036,22 @@ func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Res
}
}
}
+ if err == http2errClientConnNotEstablished {
+ // This ClientConn was created recently,
+ // this is the first request to use it,
+ // and the connection is closed and not usable.
+ //
+ // In this state, cc.idleTimer will remove the conn from the pool
+ // when it fires. Stop the timer and remove it here so future requests
+ // won't try to use this connection.
+ //
+ // If the timer has already fired and we're racing it, the redundant
+ // call to MarkDead is harmless.
+ if cc.idleTimer != nil {
+ cc.idleTimer.Stop()
+ }
+ t.connPool().MarkDead(cc)
+ }
if err != nil {
t.vlogf("RoundTrip failure: %v", err)
return nil, err
@@ -7777,9 +8070,10 @@ func (t *http2Transport) CloseIdleConnections() {
}
var (
- http2errClientConnClosed = errors.New("http2: client conn is closed")
- http2errClientConnUnusable = errors.New("http2: client conn not usable")
- http2errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
+ http2errClientConnClosed = errors.New("http2: client conn is closed")
+ http2errClientConnUnusable = errors.New("http2: client conn not usable")
+ http2errClientConnNotEstablished = errors.New("http2: client conn could not be established")
+ http2errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
)
// shouldRetryRequest is called by RoundTrip when a request fails to get
@@ -7895,44 +8189,37 @@ func (t *http2Transport) expectContinueTimeout() time.Duration {
return t.t1.ExpectContinueTimeout
}
-func (t *http2Transport) maxDecoderHeaderTableSize() uint32 {
- if v := t.MaxDecoderHeaderTableSize; v > 0 {
- return v
- }
- return http2initialHeaderTableSize
-}
-
-func (t *http2Transport) maxEncoderHeaderTableSize() uint32 {
- if v := t.MaxEncoderHeaderTableSize; v > 0 {
- return v
- }
- return http2initialHeaderTableSize
-}
-
func (t *http2Transport) NewClientConn(c net.Conn) (*http2ClientConn, error) {
return t.newClientConn(c, t.disableKeepAlives())
}
func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2ClientConn, error) {
+ conf := http2configFromTransport(t)
cc := &http2ClientConn{
- t: t,
- tconn: c,
- readerDone: make(chan struct{}),
- nextStreamID: 1,
- maxFrameSize: 16 << 10, // spec default
- initialWindowSize: 65535, // spec default
- maxConcurrentStreams: http2initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings.
- peerMaxHeaderListSize: 0xffffffffffffffff, // "infinite", per spec. Use 2^64-1 instead.
- streams: make(map[uint32]*http2clientStream),
- singleUse: singleUse,
- wantSettingsAck: true,
- pings: make(map[[8]byte]chan struct{}),
- reqHeaderMu: make(chan struct{}, 1),
+ t: t,
+ tconn: c,
+ readerDone: make(chan struct{}),
+ nextStreamID: 1,
+ maxFrameSize: 16 << 10, // spec default
+ initialWindowSize: 65535, // spec default
+ initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream,
+ maxConcurrentStreams: http2initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings.
+ peerMaxHeaderListSize: 0xffffffffffffffff, // "infinite", per spec. Use 2^64-1 instead.
+ streams: make(map[uint32]*http2clientStream),
+ singleUse: singleUse,
+ wantSettingsAck: true,
+ readIdleTimeout: conf.SendPingTimeout,
+ pingTimeout: conf.PingTimeout,
+ pings: make(map[[8]byte]chan struct{}),
+ reqHeaderMu: make(chan struct{}, 1),
+ lastActive: t.now(),
}
+ var group http2synctestGroupInterface
if t.http2transportTestHooks != nil {
t.markNewGoroutine()
t.http2transportTestHooks.newclientconn(cc)
c = cc.tconn
+ group = t.group
}
if http2VerboseLogs {
t.vlogf("http2: Transport creating client conn %p to %v", cc, c.RemoteAddr())
@@ -7944,24 +8231,23 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client
// TODO: adjust this writer size to account for frame size +
// MTU + crypto/tls record padding.
cc.bw = bufio.NewWriter(http2stickyErrWriter{
+ group: group,
conn: c,
- timeout: t.WriteByteTimeout,
+ timeout: conf.WriteByteTimeout,
err: &cc.werr,
})
cc.br = bufio.NewReader(c)
cc.fr = http2NewFramer(cc.bw, cc.br)
- if t.maxFrameReadSize() != 0 {
- cc.fr.SetMaxReadFrameSize(t.maxFrameReadSize())
- }
+ cc.fr.SetMaxReadFrameSize(conf.MaxReadFrameSize)
if t.CountError != nil {
cc.fr.countError = t.CountError
}
- maxHeaderTableSize := t.maxDecoderHeaderTableSize()
+ maxHeaderTableSize := conf.MaxDecoderHeaderTableSize
cc.fr.ReadMetaHeaders = hpack.NewDecoder(maxHeaderTableSize, nil)
cc.fr.MaxHeaderListSize = t.maxHeaderListSize()
cc.henc = hpack.NewEncoder(&cc.hbuf)
- cc.henc.SetMaxDynamicTableSizeLimit(t.maxEncoderHeaderTableSize())
+ cc.henc.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize)
cc.peerMaxHeaderTableSize = http2initialHeaderTableSize
if cs, ok := c.(http2connectionStater); ok {
@@ -7971,11 +8257,9 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client
initialSettings := []http2Setting{
{ID: http2SettingEnablePush, Val: 0},
- {ID: http2SettingInitialWindowSize, Val: http2transportDefaultStreamFlow},
- }
- if max := t.maxFrameReadSize(); max != 0 {
- initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxFrameSize, Val: max})
+ {ID: http2SettingInitialWindowSize, Val: uint32(cc.initialStreamRecvWindowSize)},
}
+ initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxFrameSize, Val: conf.MaxReadFrameSize})
if max := t.maxHeaderListSize(); max != 0 {
initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxHeaderListSize, Val: max})
}
@@ -7985,8 +8269,8 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client
cc.bw.Write(http2clientPreface)
cc.fr.WriteSettings(initialSettings...)
- cc.fr.WriteWindowUpdate(0, http2transportDefaultConnFlow)
- cc.inflow.init(http2transportDefaultConnFlow + http2initialWindowSize)
+ cc.fr.WriteWindowUpdate(0, uint32(conf.MaxUploadBufferPerConnection))
+ cc.inflow.init(conf.MaxUploadBufferPerConnection + http2initialWindowSize)
cc.bw.Flush()
if cc.werr != nil {
cc.Close()
@@ -8004,7 +8288,7 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client
}
func (cc *http2ClientConn) healthCheck() {
- pingTimeout := cc.t.pingTimeout()
+ pingTimeout := cc.pingTimeout
// We don't need to periodically ping in the health check, because the readLoop of ClientConn will
// trigger the healthCheck again if there is no frame received.
ctx, cancel := cc.t.contextWithTimeout(context.Background(), pingTimeout)
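healthCheck now reads its deadline from the per-connection pingTimeout, which configFromTransport derives from the Transport fields. The client-side equivalent of the server sketch earlier, using the exported x/net fields (a fragment; assumes imports of net/http, time, and golang.org/x/net/http2, with illustrative values):

tr := &http2.Transport{
	ReadIdleTimeout: 30 * time.Second, // trigger healthCheck after 30s without frames
	PingTimeout:     15 * time.Second, // fail the check if no PING ACK arrives by then
}
client := &http.Client{Transport: tr}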
@@ -8132,7 +8416,7 @@ func (cc *http2ClientConn) State() http2ClientConnState {
return http2ClientConnState{
Closed: cc.closed,
Closing: cc.closing || cc.singleUse || cc.doNotReuse || cc.goAway != nil,
- StreamsActive: len(cc.streams),
+ StreamsActive: len(cc.streams) + cc.pendingResets,
StreamsReserved: cc.streamsReserved,
StreamsPending: cc.pendingRequests,
LastIdle: cc.lastIdle,
@@ -8164,16 +8448,38 @@ func (cc *http2ClientConn) idleStateLocked() (st http2clientConnIdleState) {
// writing it.
maxConcurrentOkay = true
} else {
- maxConcurrentOkay = int64(len(cc.streams)+cc.streamsReserved+1) <= int64(cc.maxConcurrentStreams)
+ // We can take a new request if the total of
+ // - active streams;
+ // - reservation slots for new streams; and
+ // - streams for which we have sent a RST_STREAM and a PING,
+ // but received no subsequent frame
+ // is less than the concurrency limit.
+ maxConcurrentOkay = cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams)
}
st.canTakeNewRequest = cc.goAway == nil && !cc.closed && !cc.closing && maxConcurrentOkay &&
!cc.doNotReuse &&
int64(cc.nextStreamID)+2*int64(cc.pendingRequests) < math.MaxInt32 &&
!cc.tooIdleLocked()
+
+ // If this connection has never been used for a request and is closed,
+ // then let it take a request (which will fail).
+ //
+ // This avoids a situation where an error early in a connection's lifetime
+ // goes unreported.
+ if cc.nextStreamID == 1 && cc.streamsReserved == 0 && cc.closed {
+ st.canTakeNewRequest = true
+ }
+
return
}
+// currentRequestCountLocked reports the number of concurrency slots currently in use,
+// including active streams, reserved slots, and reset streams waiting for acknowledgement.
+func (cc *http2ClientConn) currentRequestCountLocked() int {
+ return len(cc.streams) + cc.streamsReserved + cc.pendingResets
+}
+
func (cc *http2ClientConn) canTakeNewRequestLocked() bool {
st := cc.idleStateLocked()
return st.canTakeNewRequest
@@ -8186,7 +8492,7 @@ func (cc *http2ClientConn) tooIdleLocked() bool {
// times are compared based on their wall time. We don't want
// to reuse a connection that's been sitting idle during
// VM/laptop suspend if monotonic time was also frozen.
- return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && time.Since(cc.lastIdle.Round(0)) > cc.idleTimeout
+ return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && cc.t.timeSince(cc.lastIdle.Round(0)) > cc.idleTimeout
}
// onIdleTimeout is called from a time.AfterFunc goroutine. It will
@@ -8750,6 +9056,7 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) {
cs.reqBodyClosed = make(chan struct{})
}
bodyClosed := cs.reqBodyClosed
+ closeOnIdle := cc.singleUse || cc.doNotReuse || cc.t.disableKeepAlives() || cc.goAway != nil
cc.mu.Unlock()
if mustCloseBody {
cs.reqBody.Close()
@@ -8774,16 +9081,40 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) {
if cs.sentHeaders {
if se, ok := err.(http2StreamError); ok {
if se.Cause != http2errFromPeer {
- cc.writeStreamReset(cs.ID, se.Code, err)
+ cc.writeStreamReset(cs.ID, se.Code, false, err)
}
} else {
- cc.writeStreamReset(cs.ID, http2ErrCodeCancel, err)
+ // We're cancelling an in-flight request.
+ //
+ // This could be due to the server becoming unresponsive.
+ // To avoid sending too many requests on a dead connection,
+ // we let the request continue to consume a concurrency slot
+ // until we can confirm the server is still responding.
+ // We do this by sending a PING frame along with the RST_STREAM
+ // (unless a ping is already in flight).
+ //
+ // For simplicity, we don't bother tracking the PING payload:
+ // We reset cc.pendingResets any time we receive a PING ACK.
+ //
+ // We skip this if the conn is going to be closed on idle,
+ // because it's short lived and will probably be closed before
+ // we get the ping response.
+ ping := false
+ if !closeOnIdle {
+ cc.mu.Lock()
+ if cc.pendingResets == 0 {
+ ping = true
+ }
+ cc.pendingResets++
+ cc.mu.Unlock()
+ }
+ cc.writeStreamReset(cs.ID, http2ErrCodeCancel, ping, err)
}
}
cs.bufPipe.CloseWithError(err) // no-op if already closed
} else {
if cs.sentHeaders && !cs.sentEndStream {
- cc.writeStreamReset(cs.ID, http2ErrCodeNo, nil)
+ cc.writeStreamReset(cs.ID, http2ErrCodeNo, false, nil)
}
cs.bufPipe.CloseWithError(http2errRequestCanceled)
}
@@ -8805,12 +9136,17 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) {
// Must hold cc.mu.
func (cc *http2ClientConn) awaitOpenSlotForStreamLocked(cs *http2clientStream) error {
for {
- cc.lastActive = time.Now()
+ if cc.closed && cc.nextStreamID == 1 && cc.streamsReserved == 0 {
+ // This is the very first request sent to this connection.
+ // Return a fatal error which aborts the retry loop.
+ return http2errClientConnNotEstablished
+ }
+ cc.lastActive = cc.t.now()
if cc.closed || !cc.canTakeNewRequestLocked() {
return http2errClientConnUnusable
}
cc.lastIdle = time.Time{}
- if int64(len(cc.streams)) < int64(cc.maxConcurrentStreams) {
+ if cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams) {
return nil
}
cc.pendingRequests++
@@ -9337,7 +9673,7 @@ type http2resAndError struct {
func (cc *http2ClientConn) addStreamLocked(cs *http2clientStream) {
cs.flow.add(int32(cc.initialWindowSize))
cs.flow.setConnFlow(&cc.flow)
- cs.inflow.init(http2transportDefaultStreamFlow)
+ cs.inflow.init(cc.initialStreamRecvWindowSize)
cs.ID = cc.nextStreamID
cc.nextStreamID += 2
cc.streams[cs.ID] = cs
@@ -9353,10 +9689,10 @@ func (cc *http2ClientConn) forgetStreamID(id uint32) {
if len(cc.streams) != slen-1 {
panic("forgetting unknown stream id")
}
- cc.lastActive = time.Now()
+ cc.lastActive = cc.t.now()
if len(cc.streams) == 0 && cc.idleTimer != nil {
cc.idleTimer.Reset(cc.idleTimeout)
- cc.lastIdle = time.Now()
+ cc.lastIdle = cc.t.now()
}
// Wake up writeRequestBody via clientStream.awaitFlowControl and
// wake up RoundTrip if there is a pending request.
@@ -9416,7 +9752,6 @@ func http2isEOFOrNetReadError(err error) bool {
func (rl *http2clientConnReadLoop) cleanup() {
cc := rl.cc
- cc.t.connPool().MarkDead(cc)
defer cc.closeConn()
defer close(cc.readerDone)
@@ -9440,6 +9775,24 @@ func (rl *http2clientConnReadLoop) cleanup() {
}
cc.closed = true
+ // If the connection has never been used, and has been open for only a short time,
+ // leave it in the connection pool for a little while.
+ //
+ // This avoids a situation where new connections are constantly created,
+ // added to the pool, fail, and are removed from the pool, without any error
+ // being surfaced to the user.
+ const unusedWaitTime = 5 * time.Second
+ idleTime := cc.t.now().Sub(cc.lastActive)
+ if atomic.LoadUint32(&cc.atomicReused) == 0 && idleTime < unusedWaitTime {
+ cc.idleTimer = cc.t.afterFunc(unusedWaitTime-idleTime, func() {
+ cc.t.connPool().MarkDead(cc)
+ })
+ } else {
+ cc.mu.Unlock() // avoid any deadlocks in MarkDead
+ cc.t.connPool().MarkDead(cc)
+ cc.mu.Lock()
+ }
+
for _, cs := range cc.streams {
select {
case <-cs.peerClosed:
@@ -9483,7 +9836,7 @@ func (cc *http2ClientConn) countReadFrameError(err error) {
func (rl *http2clientConnReadLoop) run() error {
cc := rl.cc
gotSettings := false
- readIdleTimeout := cc.t.ReadIdleTimeout
+ readIdleTimeout := cc.readIdleTimeout
var t http2timer
if readIdleTimeout != 0 {
t = cc.t.afterFunc(readIdleTimeout, cc.healthCheck)
@@ -9667,15 +10020,34 @@ func (rl *http2clientConnReadLoop) handleResponse(cs *http2clientStream, f *http
if f.StreamEnded() {
return nil, errors.New("1xx informational response with END_STREAM flag")
}
- cs.num1xx++
- const max1xxResponses = 5 // arbitrary bound on number of informational responses, same as net/http
- if cs.num1xx > max1xxResponses {
- return nil, errors.New("http2: too many 1xx informational responses")
- }
if fn := cs.get1xxTraceFunc(); fn != nil {
+ // If the 1xx response is being delivered to the user,
+ // then they're responsible for limiting the number
+ // of responses.
if err := fn(statusCode, textproto.MIMEHeader(header)); err != nil {
return nil, err
}
+ } else {
+ // If the user didn't examine the 1xx response, then we
+ // limit the size of all 1xx headers.
+ //
+ // This differs a bit from the HTTP/1 implementation, which
+ // limits the size of all 1xx headers plus the final response.
+ // Use the larger limit of MaxHeaderListSize and
+ // net/http.Transport.MaxResponseHeaderBytes.
+ limit := int64(cs.cc.t.maxHeaderListSize())
+ if t1 := cs.cc.t.t1; t1 != nil && t1.MaxResponseHeaderBytes > limit {
+ limit = t1.MaxResponseHeaderBytes
+ }
+ for _, h := range f.Fields {
+ cs.totalHeaderSize += int64(h.Size())
+ }
+ if cs.totalHeaderSize > limit {
+ if http2VerboseLogs {
+ log.Printf("http2: 1xx informational responses too large")
+ }
+ return nil, errors.New("header list too large")
+ }
}
if statusCode == 100 {
http2traceGot100Continue(cs.trace)
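
With no 1xx trace hook installed, the new code above charges every informational header field against a single budget: the larger of the HTTP/2 MaxHeaderListSize and net/http.Transport.MaxResponseHeaderBytes. A hedged sketch of the bookkeeping; hpackFieldSize mirrors hpack.HeaderField.Size, which per RFC 7541 section 4.1 counts name length plus value length plus 32 bytes of overhead:

package sketch

import "errors"

func hpackFieldSize(name, value string) int64 {
	return int64(len(name)) + int64(len(value)) + 32 // RFC 7541, section 4.1
}

// oneXXBudget accumulates header sizes across all 1xx responses on a stream.
type oneXXBudget struct {
	limit, used int64 // limit = max(MaxHeaderListSize, MaxResponseHeaderBytes)
}

func (b *oneXXBudget) add(name, value string) error {
	b.used += hpackFieldSize(name, value)
	if b.used > b.limit {
		return errors.New("header list too large")
	}
	return nil
}
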
@@ -10219,6 +10591,11 @@ func (rl *http2clientConnReadLoop) processPing(f *http2PingFrame) error {
close(c)
delete(cc.pings, f.Data)
}
+ if cc.pendingResets > 0 {
+ // See clientStream.cleanupWriteRequest.
+ cc.pendingResets = 0
+ cc.cond.Broadcast()
+ }
return nil
}
cc := rl.cc
@@ -10241,13 +10618,20 @@ func (rl *http2clientConnReadLoop) processPushPromise(f *http2PushPromiseFrame)
return http2ConnectionError(http2ErrCodeProtocol)
}
-func (cc *http2ClientConn) writeStreamReset(streamID uint32, code http2ErrCode, err error) {
+// writeStreamReset sends a RST_STREAM frame.
+// When ping is true, it also sends a PING frame with a random payload.
+func (cc *http2ClientConn) writeStreamReset(streamID uint32, code http2ErrCode, ping bool, err error) {
// TODO: map err to more interesting error codes, once the
// HTTP community comes up with some. But currently for
// RST_STREAM there's no equivalent to GOAWAY frame's debug
// data, and the error codes are all pretty vague ("cancel").
cc.wmu.Lock()
cc.fr.WriteRSTStream(streamID, code)
+ if ping {
+ var payload [8]byte
+ rand.Read(payload[:])
+ cc.fr.WritePing(false, payload)
+ }
cc.bw.Flush()
cc.wmu.Unlock()
}
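
The idea behind the new ping parameter: frames on a connection are processed in order, so once the PING ack comes back the peer must also have seen the preceding RST_STREAM, which is when processPing zeroes pendingResets. A sketch of the payload generation only (randomPingPayload is a hypothetical helper):

package sketch

import "crypto/rand"

func randomPingPayload() [8]byte {
	var p [8]byte
	rand.Read(p[:]) // error ignored, as in writeStreamReset above
	return p
}
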
@@ -10404,7 +10788,7 @@ func http2traceGotConn(req *Request, cc *http2ClientConn, reused bool) {
cc.mu.Lock()
ci.WasIdle = len(cc.streams) == 0 && reused
if ci.WasIdle && !cc.lastActive.IsZero() {
- ci.IdleTime = time.Since(cc.lastActive)
+ ci.IdleTime = cc.t.timeSince(cc.lastActive)
}
cc.mu.Unlock()
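
Throughout this diff, direct time.Now/time.Since/time.AfterFunc calls are rerouted through the Transport (cc.t.now, cc.t.timeSince, cc.t.afterFunc), which suggests an injectable clock so tests can control time. A minimal sketch of that shape, with hypothetical names:

package sketch

import "time"

type clock interface {
	now() time.Time
	timeSince(t time.Time) time.Duration
	afterFunc(d time.Duration, f func()) *time.Timer
}

// systemClock is the production implementation backed by package time.
type systemClock struct{}

func (systemClock) now() time.Time                      { return time.Now() }
func (systemClock) timeSince(t time.Time) time.Duration { return time.Since(t) }
func (systemClock) afterFunc(d time.Duration, f func()) *time.Timer {
	return time.AfterFunc(d, f)
}
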
@@ -10472,6 +10856,27 @@ func (t *http2Transport) dialTLSWithContext(ctx context.Context, network, addr s
return tlsCn, nil
}
+const http2nextProtoUnencryptedHTTP2 = "unencrypted_http2"
+
+// unencryptedNetConnFromTLSConn retrieves a net.Conn wrapped in a *tls.Conn.
+//
+// TLSNextProto functions accept a *tls.Conn.
+//
+// When passing an unencrypted HTTP/2 connection to a TLSNextProto function,
+// we pass a *tls.Conn with an underlying net.Conn containing the unencrypted connection.
+// To be extra careful about mistakes (accidentally dropping TLS encryption in a place
+// where we want it), the tls.Conn contains a net.Conn with an UnencryptedNetConn method
+// that returns the actual connection we want to use.
+func http2unencryptedNetConnFromTLSConn(tc *tls.Conn) (net.Conn, error) {
+ conner, ok := tc.NetConn().(interface {
+ UnencryptedNetConn() net.Conn
+ })
+ if !ok {
+ return nil, errors.New("http2: TLS conn unexpectedly found in unencrypted handoff")
+ }
+ return conner.UnencryptedNetConn(), nil
+}
+
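
A sketch of the producer side this guard expects: the dial path would wrap the raw connection in a type exposing UnencryptedNetConn before building the *tls.Conn shell (without handshaking), so only a deliberately unencrypted connection can be unwrapped here. The wrapper name is hypothetical:

package sketch

import "net"

// unencryptedConn marks a connection as intentionally non-TLS.
type unencryptedConn struct {
	net.Conn
}

func (c unencryptedConn) UnencryptedNetConn() net.Conn { return c.Conn }
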
// writeFramer is implemented by any type that is used to write frames.
type http2writeFramer interface {
writeFrame(http2writeContext) error
@@ -10588,6 +10993,18 @@ func (se http2StreamError) writeFrame(ctx http2writeContext) error {
func (se http2StreamError) staysWithinBuffer(max int) bool { return http2frameHeaderLen+4 <= max }
+type http2writePing struct {
+ data [8]byte
+}
+
+func (w http2writePing) writeFrame(ctx http2writeContext) error {
+ return ctx.Framer().WritePing(false, w.data)
+}
+
+func (w http2writePing) staysWithinBuffer(max int) bool {
+ return http2frameHeaderLen+len(w.data) <= max
+}
+
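
staysWithinBuffer lets the write scheduler confirm that the 9-byte HTTP/2 frame header plus the 8-byte PING payload fit the buffered writer in one flush; the equivalent arithmetic as a sketch:

package sketch

const frameHeaderLen = 9 // fixed frame header size, RFC 9113 section 4.1

func pingStaysWithinBuffer(bufMax int) bool {
	return frameHeaderLen+8 <= bufMax
}
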
type http2writePingAck struct{ pf *http2PingFrame }
func (w http2writePingAck) writeFrame(ctx http2writeContext) error {
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
index db42e6676a..c709b72847 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
+//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
package chacha20
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
index 3a4287f990..bd183d9ba1 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
package chacha20
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
index c672ccf698..a660b4112f 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
@@ -19,7 +19,7 @@
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
#include "textflag.h"
@@ -36,32 +36,68 @@
// for VPERMXOR
#define MASK R18
-DATA consts<>+0x00(SB)/8, $0x3320646e61707865
-DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
-DATA consts<>+0x10(SB)/8, $0x0000000000000001
-DATA consts<>+0x18(SB)/8, $0x0000000000000000
-DATA consts<>+0x20(SB)/8, $0x0000000000000004
-DATA consts<>+0x28(SB)/8, $0x0000000000000000
-DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
-DATA consts<>+0x38(SB)/8, $0x0203000106070405
-DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
-DATA consts<>+0x48(SB)/8, $0x0102030005060704
-DATA consts<>+0x50(SB)/8, $0x6170786561707865
-DATA consts<>+0x58(SB)/8, $0x6170786561707865
-DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x90(SB)/8, $0x0000000100000000
-DATA consts<>+0x98(SB)/8, $0x0000000300000002
-DATA consts<>+0xa0(SB)/8, $0x5566774411223300
-DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
-DATA consts<>+0xb0(SB)/8, $0x6677445522330011
-DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+DATA consts<>+0x00(SB)/4, $0x61707865
+DATA consts<>+0x04(SB)/4, $0x3320646e
+DATA consts<>+0x08(SB)/4, $0x79622d32
+DATA consts<>+0x0c(SB)/4, $0x6b206574
+DATA consts<>+0x10(SB)/4, $0x00000001
+DATA consts<>+0x14(SB)/4, $0x00000000
+DATA consts<>+0x18(SB)/4, $0x00000000
+DATA consts<>+0x1c(SB)/4, $0x00000000
+DATA consts<>+0x20(SB)/4, $0x00000004
+DATA consts<>+0x24(SB)/4, $0x00000000
+DATA consts<>+0x28(SB)/4, $0x00000000
+DATA consts<>+0x2c(SB)/4, $0x00000000
+DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
+DATA consts<>+0x34(SB)/4, $0x0a0b0809
+DATA consts<>+0x38(SB)/4, $0x06070405
+DATA consts<>+0x3c(SB)/4, $0x02030001
+DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
+DATA consts<>+0x44(SB)/4, $0x090a0b08
+DATA consts<>+0x48(SB)/4, $0x05060704
+DATA consts<>+0x4c(SB)/4, $0x01020300
+DATA consts<>+0x50(SB)/4, $0x61707865
+DATA consts<>+0x54(SB)/4, $0x61707865
+DATA consts<>+0x58(SB)/4, $0x61707865
+DATA consts<>+0x5c(SB)/4, $0x61707865
+DATA consts<>+0x60(SB)/4, $0x3320646e
+DATA consts<>+0x64(SB)/4, $0x3320646e
+DATA consts<>+0x68(SB)/4, $0x3320646e
+DATA consts<>+0x6c(SB)/4, $0x3320646e
+DATA consts<>+0x70(SB)/4, $0x79622d32
+DATA consts<>+0x74(SB)/4, $0x79622d32
+DATA consts<>+0x78(SB)/4, $0x79622d32
+DATA consts<>+0x7c(SB)/4, $0x79622d32
+DATA consts<>+0x80(SB)/4, $0x6b206574
+DATA consts<>+0x84(SB)/4, $0x6b206574
+DATA consts<>+0x88(SB)/4, $0x6b206574
+DATA consts<>+0x8c(SB)/4, $0x6b206574
+DATA consts<>+0x90(SB)/4, $0x00000000
+DATA consts<>+0x94(SB)/4, $0x00000001
+DATA consts<>+0x98(SB)/4, $0x00000002
+DATA consts<>+0x9c(SB)/4, $0x00000003
+DATA consts<>+0xa0(SB)/4, $0x11223300
+DATA consts<>+0xa4(SB)/4, $0x55667744
+DATA consts<>+0xa8(SB)/4, $0x99aabb88
+DATA consts<>+0xac(SB)/4, $0xddeeffcc
+DATA consts<>+0xb0(SB)/4, $0x22330011
+DATA consts<>+0xb4(SB)/4, $0x66774455
+DATA consts<>+0xb8(SB)/4, $0xaabb8899
+DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0
+#ifdef GOARCH_ppc64
+#define BE_XXBRW_INIT() \
+ LVSL (R0)(R0), V24 \
+ VSPLTISB $3, V25 \
+ VXOR V24, V25, V24 \
+
+#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
+#else
+#define BE_XXBRW_INIT()
+#define BE_XXBRW(vr)
+#endif
+
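
On big-endian ppc64, LVSL (R0)(R0) loads the byte indexes 0 through 15, and XORing each with 3 yields a VPERM control vector that reverses byte order within every 32-bit lane; on ppc64le both macros expand to nothing. A scalar Go sketch of the fix-up BE_XXBRW performs:

package sketch

import "math/bits"

// swapLanes reverses the byte order inside each 32-bit word, which is what
// VPERM with the index-XOR-3 control vector does to a vector register.
func swapLanes(v *[4]uint32) {
	for i := range v {
		v[i] = bits.ReverseBytes32(v[i])
	}
}
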
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
MOVD out+0(FP), OUT
@@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
// Clear V27
VXOR V27, V27, V27
+ BE_XXBRW_INIT()
+
// V28
LXVW4X (CONSTBASE)(R11), VS60
@@ -299,6 +337,11 @@ loop_vsx:
VADDUWM V8, V18, V8
VADDUWM V12, V19, V12
+ BE_XXBRW(V0)
+ BE_XXBRW(V4)
+ BE_XXBRW(V8)
+ BE_XXBRW(V12)
+
CMPU LEN, $64
BLT tail_vsx
@@ -327,6 +370,11 @@ loop_vsx:
VADDUWM V9, V18, V8
VADDUWM V13, V19, V12
+ BE_XXBRW(V0)
+ BE_XXBRW(V4)
+ BE_XXBRW(V8)
+ BE_XXBRW(V12)
+
CMPU LEN, $64
BLT tail_vsx
@@ -334,8 +382,8 @@ loop_vsx:
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
- VXOR V27, V0, V27
+ VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
@@ -354,6 +402,11 @@ loop_vsx:
VADDUWM V10, V18, V8
VADDUWM V14, V19, V12
+ BE_XXBRW(V0)
+ BE_XXBRW(V4)
+ BE_XXBRW(V8)
+ BE_XXBRW(V12)
+
CMPU LEN, $64
BLT tail_vsx
@@ -381,6 +434,11 @@ loop_vsx:
VADDUWM V11, V18, V8
VADDUWM V15, V19, V12
+ BE_XXBRW(V0)
+ BE_XXBRW(V4)
+ BE_XXBRW(V8)
+ BE_XXBRW(V12)
+
CMPU LEN, $64
BLT tail_vsx
@@ -408,9 +466,9 @@ loop_vsx:
done_vsx:
// Increment counter by number of 64 byte blocks
- MOVD (CNT), R14
+ MOVWZ (CNT), R14
ADD BLOCKS, R14
- MOVD R14, (CNT)
+ MOVWZ R14, (CNT)
RET
tail_vsx:
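
The MOVD to MOVWZ change above matters because the counter argument is a *uint32: a doubleword access reads and writes four bytes beyond the counter and, on big-endian, lands on the wrong half of the word. The intended semantics in Go:

package sketch

// advanceCounter adds the number of processed 64-byte blocks to the 32-bit
// block counter, wrapping modulo 2^32 exactly as the 32-bit store does.
func advanceCounter(ctr *uint32, blocks uint32) {
	*ctr += blocks
}
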
diff --git a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
index 731d2ac6db..fd5ee845f9 100644
--- a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -1,2715 +1,9762 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-
-// Some macros
-
-// ROL rotates the uint32s in register R left by N bits, using temporary T.
-#define ROL(N, R, T) \
- MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
-
-// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
-#else
-#define ROL16(R, T) ROL(16, R, T)
-#endif
-
-// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
-#else
-#define ROL8(R, T) ROL(8, R, T)
-#endif
-#define chachaQR(A, B, C, D, T) \
- PADDD B, A; PXOR A, D; ROL16(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
- PADDD B, A; PXOR A, D; ROL8(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
+// func polyHashADInternal<>()
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
- // adp points to beginning of additional data
- // itr2 holds ad length
- XORQ acc0, acc0
- XORQ acc1, acc1
- XORQ acc2, acc2
- CMPQ itr2, $13
- JNE hashADLoop
+ // Hack: Must declare #define macros inside of a function due to Avo constraints
+ // ROL rotates the uint32s in register R left by N bits, using temporary T.
+ #define ROL(N, R, T) \
+ MOVO R, T; \
+ PSLLL $(N), T; \
+ PSRLL $(32-(N)), R; \
+ PXOR T, R
+
+ // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+ #else
+ #define ROL8(R, T) ROL(8, R, T)
+ #endif
-openFastTLSAD:
- // Special treatment for the TLS case of 13 bytes
- MOVQ (adp), acc0
- MOVQ 5(adp), acc1
- SHRQ $24, acc1
- MOVQ $1, acc2
- polyMul
+ // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+ #else
+ #define ROL16(R, T) ROL(16, R, T)
+ #endif
+ XORQ R10, R10
+ XORQ R11, R11
+ XORQ R12, R12
+ CMPQ R9, $0x0d
+ JNE hashADLoop
+ MOVQ (CX), R10
+ MOVQ 5(CX), R11
+ SHRQ $0x18, R11
+ MOVQ $0x00000001, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
RET
hashADLoop:
// Hash in 16 byte chunks
- CMPQ itr2, $16
- JB hashADTail
- polyAdd(0(adp))
- LEAQ (1*16)(adp), adp
- SUBQ $16, itr2
- polyMul
- JMP hashADLoop
+ CMPQ R9, $0x10
+ JB hashADTail
+ ADDQ (CX), R10
+ ADCQ 8(CX), R11
+ ADCQ $0x01, R12
+ LEAQ 16(CX), CX
+ SUBQ $0x10, R9
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ JMP hashADLoop
hashADTail:
- CMPQ itr2, $0
+ CMPQ R9, $0x00
JE hashADDone
// Hash last < 16 byte tail
- XORQ t0, t0
- XORQ t1, t1
- XORQ t2, t2
- ADDQ itr2, adp
+ XORQ R13, R13
+ XORQ R14, R14
+ XORQ R15, R15
+ ADDQ R9, CX
hashADTailLoop:
- SHLQ $8, t0, t1
- SHLQ $8, t0
- MOVB -1(adp), t2
- XORQ t2, t0
- DECQ adp
- DECQ itr2
- JNE hashADTailLoop
+ SHLQ $0x08, R13, R14
+ SHLQ $0x08, R13
+ MOVB -1(CX), R15
+ XORQ R15, R13
+ DECQ CX
+ DECQ R9
+ JNE hashADTailLoop
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
-hashADTailFinish:
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- // Finished AD
hashADDone:
RET
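
polyHashADInternal is the Avo expansion of the old polyAdd/polyMul macros, so two details are worth decoding from the register soup. First, the 13-byte TLS fast path loads the additional data as two overlapping 8-byte reads, shifting the second right by 24 bits to drop the overlap; a sketch with a hypothetical helper:

package sketch

import "encoding/binary"

// load13 mimics MOVQ (CX); MOVQ 5(CX); SHRQ $0x18, R11: two overlapping
// little-endian reads, the second shifted so the 3 shared bytes vanish.
func load13(b []byte) (lo, hi uint64) {
	lo = binary.LittleEndian.Uint64(b[0:8])
	hi = binary.LittleEndian.Uint64(b[5:13]) >> 24
	return lo, hi
}

Second, every multiply ends with the standard Poly1305 partial reduction. Since 2^130 ≡ 5 (mod 2^130 − 5), a product t = t_lo + 2^130·t_hi reduces as t ≡ t_lo + t_hi + 4·t_hi: the ANDQ $-4 / SHRQ $0x02 pair extracts 4·t_hi and t_hi from the top two limbs, and the two ADDQ/ADCQ chains fold them into the 130-bit accumulator.
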
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
// Check for AVX2 support
- CMPB ·useAVX2(SB), $1
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Open_AVX2
// Special optimization, for very short buffers
- CMPQ inl, $128
- JBE openSSE128 // About 16% faster
+ CMPQ BX, $0x80
+ JBE openSSE128
// For long buffers, prepare the poly key first
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
- MOVO D0, T1
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X9, X13
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
- MOVO D0, ctr3Store
- MOVQ $10, itr2
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
+ MOVO X9, 128(BP)
+ MOVQ $0x0000000a, R9
openSSEPreparePolyKey:
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- DECQ itr2
- JNE openSSEPreparePolyKey
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ DECQ R9
+ JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore; MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
// Hash AAD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSEMainLoop:
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JB openSSEMainLoopDone
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
- // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
- MOVQ $4, itr1
- MOVQ inp, itr2
+ // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
+ // 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+ MOVQ $0x00000004, CX
+ MOVQ SI, R9
openSSEInternalLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(itr2))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(itr2), itr2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr1
- JGE openSSEInternalLoop
-
- polyAdd(0(itr2))
- polyMul
- LEAQ (2*8)(itr2), itr2
-
- CMPQ itr1, $-6
- JG openSSEInternalLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(R9), R9
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ CX
+ JGE openSSEInternalLoop
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ CMPQ CX, $-6
+ JG openSSEInternalLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Load - xor - store
- MOVO D3, tmpStore
- MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
- MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
- MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
- MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
- MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
- MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
- MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
- MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
- MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
- MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
- MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
- MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
- MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
- MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
- LEAQ 256(inp), inp
- LEAQ 256(oup), oup
- SUBQ $256, inl
+ MOVO X15, 64(BP)
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU X3, 16(DI)
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU X6, 32(DI)
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X9
+ PXOR X9, X1
+ MOVOU X1, 64(DI)
+ MOVOU 80(SI), X9
+ PXOR X9, X4
+ MOVOU X4, 80(DI)
+ MOVOU 96(SI), X9
+ PXOR X9, X7
+ MOVOU X7, 96(DI)
+ MOVOU 112(SI), X9
+ PXOR X9, X10
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X9
+ PXOR X9, X2
+ MOVOU X2, 128(DI)
+ MOVOU 144(SI), X9
+ PXOR X9, X5
+ MOVOU X5, 144(DI)
+ MOVOU 160(SI), X9
+ PXOR X9, X8
+ MOVOU X8, 160(DI)
+ MOVOU 176(SI), X9
+ PXOR X9, X11
+ MOVOU X11, 176(DI)
+ MOVOU 192(SI), X9
+ PXOR X9, X12
+ MOVOU X12, 192(DI)
+ MOVOU 208(SI), X9
+ PXOR X9, X13
+ MOVOU X13, 208(DI)
+ MOVOU 224(SI), X9
+ PXOR X9, X14
+ MOVOU X14, 224(DI)
+ MOVOU 240(SI), X9
+ PXOR 64(BP), X9
+ MOVOU X9, 240(DI)
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
JMP openSSEMainLoop
openSSEMainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $64
+ CMPQ BX, $0x40
JBE openSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openSSETail128
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openSSETail192
JMP openSSETail256
openSSEFinalize:
// Hash in the PT, AAD lengths
- ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally, constant time compare to the tag at the end of the message
XORQ AX, AX
- MOVQ $1, DX
- XORQ (0*8)(inp), acc0
- XORQ (1*8)(inp), acc1
- ORQ acc1, acc0
+ MOVQ $0x00000001, DX
+ XORQ (SI), R10
+ XORQ 8(SI), R11
+ ORQ R11, R10
CMOVQEQ DX, AX
// Return true iff tags are equal
MOVB AX, ret+96(FP)
RET
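
The XORQ/ORQ/CMOVQEQ tail above is a branch-free, constant-time comparison of the computed tag against the 16 bytes that follow the ciphertext; in pure Go the same property comes from crypto/subtle:

package sketch

import "crypto/subtle"

// tagsEqual reports whether two Poly1305 tags match, in constant time.
func tagsEqual(got, want *[16]byte) bool {
	return subtle.ConstantTimeCompare(got[:], want[:]) == 1
}
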
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
openSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
openSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE openSSE128InnerCipherLoop
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore; MOVOU B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSE128Open:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail16
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0(inp))
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
// Load for decryption
- MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- polyMul
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP openSSE128Open
openSSETail16:
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
// We can safely load the CT from the end, because it is padded with the MAC
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVOU (inp), T0
- ADDQ inl, inp
- PAND -16(t0)(itr2*1), T0
- MOVO T0, 0+tmpStore
- MOVQ T0, t0
- MOVQ 8+tmpStore, t1
- PXOR A1, T0
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVOU (SI), X12
+ ADDQ BX, SI
+ PAND -16(R13)(R9*1), X12
+ MOVO X12, 64(BP)
+ MOVQ X12, R13
+ MOVQ 72(BP), R14
+ PXOR X1, X12
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
- MOVQ T0, t3
- MOVB t3, (oup)
- PSRLDQ $1, T0
- INCQ oup
- DECQ inl
+ MOVQ X12, R8
+ MOVB R8, (DI)
+ PSRLDQ $0x01, X12
+ INCQ DI
+ DECQ BX
JNE openSSETail16Store
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
JMP openSSEFinalize
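
openSSETail16 above can load a full 16 bytes even when fewer remain, because the ciphertext is always followed by the 16-byte MAC; the PAND with an andMask entry then zeroes the excess. A Go sketch of the same effect that avoids the over-read instead of masking it:

package sketch

// partialBlock copies up to 16 trailing ciphertext bytes into a zeroed
// block; bytes past len(ct) stay zero, matching the masked vector load.
func partialBlock(ct []byte) (block [16]byte) {
	copy(block[:], ct)
	return block
}
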
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
- // Need to decrypt up to 64 bytes - prepare single block
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- CMPQ itr1, $16
- JB openSSETail64LoopB
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ CMPQ CX, $0x10
+ JB openSSETail64LoopB
openSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
- SUBQ $16, itr1
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
openSSETail64LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
-
- CMPQ itr1, $16
- JAE openSSETail64LoopA
-
- CMPQ itr2, $160
- JNE openSSETail64LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ CMPQ CX, $0x10
+ JAE openSSETail64LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail64LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
+ PADDL 48(BP), X6
+ PADDL 80(BP), X9
openSSETail64DecLoop:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail64DecLoopDone
- SUBQ $16, inl
- MOVOU (inp), T0
- PXOR T0, A0
- MOVOU A0, (oup)
- LEAQ 16(inp), inp
- LEAQ 16(oup), oup
- MOVO B0, A0
- MOVO C0, B0
- MOVO D0, C0
+ SUBQ $0x10, BX
+ MOVOU (SI), X12
+ PXOR X12, X0
+ MOVOU X0, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVO X3, X0
+ MOVO X6, X3
+ MOVO X9, X6
JMP openSSETail64DecLoop
openSSETail64DecLoopDone:
- MOVO A0, A1
+ MOVO X0, X1
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
- // Need to decrypt up to 128 bytes - prepare two blocks
- MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 96(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
openSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSETail128LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
-
- CMPQ itr2, itr1
- JB openSSETail128LoopA
-
- CMPQ itr2, $160
- JNE openSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
- SUBQ $64, inl
- LEAQ 64(inp), inp
- LEAQ 64(oup), oup
- JMP openSSETail64DecLoop
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSETail128LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 96(BP), X9
+ PADDL 80(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ JMP openSSETail64DecLoop
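+	// The first of the two prepared blocks decrypts 64 bytes here; the
+	// second (X0/X3/X6/X9) is left for openSSETail64DecLoop to consume.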
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
- // Need to decrypt up to 192 bytes - prepare three blocks
- MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
- MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
- MOVQ inl, itr1
- MOVQ $160, itr2
- CMPQ itr1, $160
- CMOVQGT itr2, itr1
- ANDQ $-16, itr1
- XORQ itr2, itr2
+ MOVO ·chacha20Constants<>+0(SB), X2
+ MOVO 32(BP), X5
+ MOVO 48(BP), X8
+ MOVO 128(BP), X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 80(BP)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 112(BP)
+ MOVQ BX, CX
+ MOVQ $0x000000a0, R9
+ CMPQ CX, $0xa0
+ CMOVQGT R9, CX
+ ANDQ $-16, CX
+ XORQ R9, R9
openSSLTail192LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSLTail192LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- CMPQ itr2, itr1
- JB openSSLTail192LoopA
-
- CMPQ itr2, $160
- JNE openSSLTail192LoopB
-
- CMPQ inl, $176
- JB openSSLTail192Store
-
- polyAdd(160(inp))
- polyMul
-
- CMPQ inl, $192
- JB openSSLTail192Store
-
- polyAdd(176(inp))
- polyMul
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSLTail192LoopA
+ CMPQ R9, $0xa0
+ JNE openSSLTail192LoopB
+ CMPQ BX, $0xb0
+ JB openSSLTail192Store
+ ADDQ 160(SI), R10
+ ADCQ 168(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ CMPQ BX, $0xc0
+ JB openSSLTail192Store
+ ADDQ 176(SI), R10
+ ADCQ 184(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
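+	// The two guarded blocks above hash the 160..176 and 176..192 byte
+	// regions only when that much ciphertext is present (BX >= 0xb0 and
+	// BX >= 0xc0), as the old guarded polyAdd/polyMul calls did.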
openSSLTail192Store:
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
- MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- SUBQ $128, inl
- LEAQ 128(inp), inp
- LEAQ 128(oup), oup
- JMP openSSETail64DecLoop
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 112(BP), X9
+ PADDL 96(BP), X10
+ PADDL 80(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X2
+ PXOR X13, X5
+ PXOR X14, X8
+ PXOR X15, X11
+ MOVOU X2, (DI)
+ MOVOU X5, 16(DI)
+ MOVOU X8, 32(DI)
+ MOVOU X11, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ JMP openSSETail64DecLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- XORQ itr2, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ XORQ R9, R9
openSSETail256Loop:
- // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
- polyAdd(0(inp)(itr2*1))
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulStage3
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- ADDQ $2*8, itr2
- CMPQ itr2, $160
- JB openSSETail256Loop
- MOVQ inl, itr1
- ANDQ $-16, itr1
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ ADDQ $0x10, R9
+ CMPQ R9, $0xa0
+ JB openSSETail256Loop
+ MOVQ BX, CX
+ ANDQ $-16, CX
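+	// Ten double rounds hashed the first 160 bytes; hash any remaining
+	// full 16-byte words before adding in the state.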
openSSETail256HashLoop:
- polyAdd(0(inp)(itr2*1))
- polyMul
- ADDQ $2*8, itr2
- CMPQ itr2, itr1
- JB openSSETail256HashLoop
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, R9
+ CMPQ R9, CX
+ JB openSSETail256HashLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- LEAQ 192(inp), inp
- LEAQ 192(oup), oup
- SUBQ $192, inl
- MOVO A3, A0
- MOVO B3, B0
- MOVO C3, C0
- MOVO tmpStore, D0
-
- JMP openSSETail64DecLoop
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ LEAQ 192(SI), SI
+ LEAQ 192(DI), DI
+ SUBQ $0xc0, BX
+ MOVO X12, X0
+ MOVO X13, X3
+ MOVO X14, X6
+ MOVO 64(BP), X9
+ JMP openSSETail64DecLoop
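+	// 192 of the up-to-256 tail bytes are decrypted above; the fourth
+	// block's key stream is moved into X0/X3/X6/X9 so that
+	// openSSETail64DecLoop can finish the remainder.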
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
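+	// The three BYTE runs above are the hand-encoded VBROADCASTI128
+	// instructions noted in the old source: they broadcast 16(R8), 32(R8)
+	// and 48(R8) into Y14, Y12 and Y4, filling in the rest of the ChaCha
+	// state.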
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimization for very short buffers
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openAVX2192
- CMPQ inl, $320
+ CMPQ BX, $0x00000140
JBE openAVX2320
// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
- VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, state2StoreAVX2
- VMOVDQA DD0, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, 64(BP)
+ VMOVDQA Y4, 192(BP)
+ MOVQ $0x0000000a, R9
openAVX2PreparePolyKey:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- DECQ itr2
- JNE openAVX2PreparePolyKey
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0
- VPADDD state1StoreAVX2, BB0, BB0
- VPADDD state2StoreAVX2, CC0, CC0
- VPADDD ctr3StoreAVX2, DD0, DD0
-
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ R9
+ JNE openAVX2PreparePolyKey
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 192(BP), Y4, Y4
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for the first 64 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
// Hash AD + first 64 bytes
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
openAVX2InitialHash64:
- polyAdd(0(inp)(itr1*1))
- polyMulAVX2
- ADDQ $16, itr1
- CMPQ itr1, $64
- JNE openAVX2InitialHash64
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, CX
+ CMPQ CX, $0x40
+ JNE openAVX2InitialHash64
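+	// This hash loop is the expanded polyAdd/polyMulAVX2 pair; the AVX2
+	// paths use MULXQ (BMI2) for the 64x64 multiplies instead of the MULQ
+	// chain used by the SSE code.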
// Decrypt the first 64 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), BB0, BB0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU BB0, (1*32)(oup)
- LEAQ (2*32)(inp), inp
- LEAQ (2*32)(oup), oup
- SUBQ $64, inl
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y14, 32(DI)
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ SUBQ $0x40, BX
openAVX2MainLoop:
- CMPQ inl, $512
+ CMPQ BX, $0x00000200
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
openAVX2InternalLoop:
- // Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
- // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
- polyAdd(0*8(inp)(itr1*1))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(inp)(itr1*1))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(inp)(itr1*1))
- LEAQ (6*8)(itr1), itr1
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- CMPQ itr1, $480
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(SI)(CX*1), R10
+ ADCQ 24(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(SI)(CX*1), R10
+ ADCQ 40(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ LEAQ 48(CX), CX
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ CMPQ CX, $0x000001e0
JNE openAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(480(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ 480(SI), R10
+ ADCQ 488(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
// and here
- polyAdd(496(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- LEAQ (32*16)(oup), oup
- SUBQ $(32*16), inl
+ ADDQ 496(SI), R10
+ ADCQ 504(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ LEAQ 512(DI), DI
+ SUBQ $0x00000200, BX
JMP openAVX2MainLoop
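	// Each main loop iteration decrypts 512 bytes; the inner loop hashes 48
	// bytes per pass (three poly steps) up to 480, and the final 32 bytes
	// are hashed during the store-out above.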
openAVX2MainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openAVX2Tail128
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JBE openAVX2Tail256
- CMPQ inl, $384
+ CMPQ BX, $0x00000180
JBE openAVX2Tail384
JMP openAVX2Tail512
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
openAVX2192:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
openAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE openAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
openAVX2ShortOpen:
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openAVX2ShortOpenLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
- polyAdd(2*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(SI), R10
+ ADCQ 24(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP openAVX2ShortOpenLoop
openAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2ShortDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
openAVX2320:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
openAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE openAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP openAVX2ShortOpen
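	// The 320-byte path runs six 64-byte blocks across three ymm register
	// groups, then reuses the common openAVX2ShortOpen hash-and-decrypt
	// loop.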
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
- VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD1
- VMOVDQA DD1, DD0
-
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
- TESTQ itr1, itr1
- JE openAVX2Tail128LoopB
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
+ VMOVDQA Y1, Y4
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
+ TESTQ CX, CX
+ JE openAVX2Tail128LoopB
openAVX2Tail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMulAVX2
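+	// Inlined polyAdd/polyMulAVX2: add 16 bytes of input into the Poly1305
+	// accumulator h (R10:R11:R12), multiply by r (at 0(BP) and 8(BP)) with
+	// MULX, and reduce modulo 2^130 - 5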
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openAVX2Tail128LoopB:
- ADDQ $16, itr2
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
- JB openAVX2Tail128LoopA
- CMPQ itr2, $160
- JNE openAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC1, CC1
- VPADDD DD0, DD1, DD1
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
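+	// Two ChaCha quarter-round passes per iteration (inlined chachaQR_AVX2)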
+ ADDQ $0x10, R9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
+ JB openAVX2Tail128LoopA
+ CMPQ R9, $0xa0
+ JNE openAVX2Tail128LoopB
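+	// Add the initial state back in and rearrange the 128-bit lanes into
+	// stream order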
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y4, Y1, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
openAVX2TailLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2Tail
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
JMP openAVX2TailLoop
openAVX2Tail:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2TailDone
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2TailDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
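+// Special optimization for the last 256 bytes of ciphertext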
openAVX2Tail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
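+	// Need to decrypt up to 256 bytes - prepare four blocks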
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
// Compute the number of iterations that will hash data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $128, itr1
- SHRQ $4, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x80, CX
+ SHRQ $0x04, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
openAVX2Tail256LoopA:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
- // Perform ChaCha rounds, while hashing the remaining input
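+	// Perform ChaCha rounds, while hashing the remaining input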
openAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
JB openAVX2Tail256LoopA
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail256LoopB
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
- CMPQ itr2, $10
- JNE openAVX2Tail256LoopB
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
-
- // Hash the remainder of data (if any)
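+	// Hash the remainder of data (if any)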
openAVX2Tail256Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail256HashEnd
- polyAdd (0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail256Hash
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail256HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail256Hash
-// Store 128 bytes safely, then go to store loop
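+// Store 128 bytes safely, then go to store loop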
openAVX2Tail256HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
- VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
- VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
- LEAQ (4*32)(inp), inp
- LEAQ (4*32)(oup), oup
- SUBQ $4*32, inl
-
- JMP openAVX2TailLoop
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y6
+ VPERM2I128 $0x02, Y12, Y4, Y10
+ VPERM2I128 $0x13, Y0, Y14, Y8
+ VPERM2I128 $0x13, Y12, Y4, Y2
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR (SI), Y6, Y6
+ VPXOR 32(SI), Y10, Y10
+ VPXOR 64(SI), Y8, Y8
+ VPXOR 96(SI), Y2, Y2
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y10, 32(DI)
+ VMOVDQU Y8, 64(DI)
+ VMOVDQU Y2, 96(DI)
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ SUBQ $0x80, BX
+ JMP openAVX2TailLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
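+// Special optimization for the last 384 bytes of ciphertext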
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, ctr0StoreAVX2
- VMOVDQA DD1, ctr1StoreAVX2
- VMOVDQA DD2, ctr2StoreAVX2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
// Compute the number of iterations that will hash two blocks of data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $256, itr1
- SHRQ $4, itr1
- ADDQ $6, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x00000100, CX
+ SHRQ $0x04, CX
+ ADDQ $0x06, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
- // Perform ChaCha rounds, while hashing the remaining input
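+	// Perform ChaCha rounds, while hashing the remaining input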
openAVX2Tail384LoopB:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
openAVX2Tail384LoopA:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
- CMPQ itr2, itr1
- JB openAVX2Tail384LoopB
-
- CMPQ itr2, $10
- JNE openAVX2Tail384LoopA
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ CMPQ R9, CX
+ JB openAVX2Tail384LoopB
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail384LoopA
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
openAVX2Tail384Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail384HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail384Hash
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail384HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail384Hash
-// Store 256 bytes safely, then go to store loop
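+// Store 256 bytes safely, then go to store loop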
openAVX2Tail384HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- LEAQ (8*32)(inp), inp
- LEAQ (8*32)(oup), oup
- SUBQ $8*32, inl
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
JMP openAVX2TailLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
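+// Special optimization for the last 512 bytes of ciphertext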
openAVX2Tail512:
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
- MOVQ inp, itr2
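+	// Need to decrypt up to 512 bytes - prepare eight blocks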
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
+ MOVQ SI, R9
openAVX2Tail512LoopB:
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ (2*8)(itr2), itr2
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
openAVX2Tail512LoopA:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(itr2))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(itr2))
- polyMulAVX2
- LEAQ (4*8)(itr2), itr2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- INCQ itr1
- CMPQ itr1, $4
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(R9), R10
+ ADCQ 24(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(R9), R9
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ INCQ CX
+ CMPQ CX, $0x04
JLT openAVX2Tail512LoopB
-
- CMPQ itr1, $10
- JNE openAVX2Tail512LoopA
-
- MOVQ inl, itr1
- SUBQ $384, itr1
- ANDQ $-16, itr1
+ CMPQ CX, $0x0a
+ JNE openAVX2Tail512LoopA
+ MOVQ BX, CX
+ SUBQ $0x00000180, CX
+ ANDQ $-16, CX
openAVX2Tail512HashLoop:
- TESTQ itr1, itr1
+ TESTQ CX, CX
JE openAVX2Tail512HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- SUBQ $16, itr1
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ SUBQ $0x10, CX
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ LEAQ 384(SI), SI
+ LEAQ 384(DI), DI
+ SUBQ $0x00000180, BX
+ JMP openAVX2TailLoop
- LEAQ (12*32)(inp), inp
- LEAQ (12*32)(oup), oup
- SUBQ $12*32, inl
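+// Constant pool shared by the open and seal paths: the ChaCha20 "expand
+// 32-byte k" constants, the Poly1305 clamp mask, the SSE/AVX2 counter
+// increment masks, the partial-block AND masks, and the PSHUFB masks for
+// the 8-bit and 16-bit rotates.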
+DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
+GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
- JMP openAVX2TailLoop
+DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
+DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
+DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
+DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
+GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
- // For aligned stack access
+DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
+DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
+GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
+
+DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+8(SB)/8, $0x0000000000000000
+DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+24(SB)/8, $0x0000000000000000
+DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+40(SB)/8, $0x0000000000000000
+DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+56(SB)/8, $0x0000000000000000
+DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+72(SB)/8, $0x0000000000000000
+DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+88(SB)/8, $0x0000000000000000
+DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+104(SB)/8, $0x0000000000000000
+DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+120(SB)/8, $0x0000000000000000
+DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
+GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
+
+DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
+DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol16<>+0(SB)/8, $0x0504070601000302
+DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+DATA ·rol16<>+16(SB)/8, $0x0504070601000302
+DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol8<>+0(SB)/8, $0x0605040702010003
+DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
+DATA ·rol8<>+16(SB)/8, $0x0605040702010003
+DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
+GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
+
+DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
+
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Seal(SB), $288-96
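+	// For aligned stack access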
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
-
- CMPB ·useAVX2(SB), $1
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Seal_AVX2
	// Special optimization for very short buffers
- CMPQ inl, $128
- JBE sealSSE128 // About 15% faster
+ CMPQ BX, $0x80
+	JBE  sealSSE128 // About 15% faster
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
// Load state, increment counter blocks
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- MOVQ $10, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ MOVQ $0x0000000a, R9
sealSSEIntroLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JNE sealSSEIntroLoop
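+	// Inlined chachaQR quarter rounds for the four parallel blocks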
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
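+	// shiftB0Left..shiftD3Left: PALIGNR $4/$8/$12 row rotations,
+	// hand-encoded as BYTE sequences (66 0F 3A 0F /r ib)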
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
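+	// shiftB0Right..shiftD3Right: PALIGNR $12/$8/$4 row rotations as BYTE
+	// sequences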
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSEIntroLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore
- MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
// Hash AAD
- MOVQ ad_len+80(FP), itr2
- CALL polyHashADInternal<>(SB)
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
- MOVQ $128, itr1
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
- CMPQ inl, $64
- JBE sealSSE128SealHash
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
- ADDQ $64, itr1
- SUBQ $64, inl
- LEAQ 64(inp), inp
-
- MOVQ $2, itr1
- MOVQ $8, itr2
-
- CMPQ inl, $64
- JBE sealSSETail64
- CMPQ inl, $128
- JBE sealSSETail128
- CMPQ inl, $192
- JBE sealSSETail192
+ MOVQ ad_len+80(FP), R9
+ CALL polyHashADInternal<>(SB)
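+	// Encrypt the first 128 bytes of plaintext with the prepared stream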
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 64(DI)
+ MOVOU X5, 80(DI)
+ MOVOU X8, 96(DI)
+ MOVOU X11, 112(DI)
+ MOVQ $0x00000080, CX
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
+ JBE sealSSE128SealHash
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 128(DI)
+ MOVOU X13, 144(DI)
+ MOVOU X14, 160(DI)
+ MOVOU X15, 176(DI)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ MOVQ $0x00000002, CX
+ MOVQ $0x00000008, R9
+ CMPQ BX, $0x40
+ JBE sealSSETail64
+ CMPQ BX, $0x80
+ JBE sealSSETail128
+ CMPQ BX, $0xc0
+ JBE sealSSETail192
sealSSEMainLoop:
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
sealSSEInnerLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(oup))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(oup), oup
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JGE sealSSEInnerLoop
- polyAdd(0(oup))
- polyMul
- LEAQ (2*8)(oup), oup
- DECQ itr1
- JG sealSSEInnerLoop
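+	// Inlined chachaQR rounds for four blocks, interleaved with Poly1305
+	// hashing of the ciphertext just written at (DI)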
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
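+	// shiftB/C/D Left: PALIGNR rotations as BYTE sequences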
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(DI), DI
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JGE sealSSEInnerLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSEInnerLoop
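The long run above is the mechanical expansion of the old file's macros: each PADDD/PXOR/ROL16 ... PSLLL/PSRLL group is one chachaQR quarter round, each ADDQ/ADCQ/MULQ/IMULQ run is one polyAdd/polyMul step, and the BYTE $0x66, $0x0f, $0x3a, $0x0f, ... sequences are hand-encoded PALIGNR instructions (opcode 66 0F 3A 0F) performing the old shiftB/shiftC/shiftD lane rotations. A minimal Go sketch of the two primitives follows; the names and layout are illustrative, not part of the generated file, and only "math/bits" is assumed.

	package sketch

	import "math/bits"

	// quarterRound is the scalar form of each PADDD/PXOR/ROL16 ... PSRLL
	// group above: the ChaCha20 quarter round, rotating by 16, 12, 8, 7.
	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
		a += b
		d = bits.RotateLeft32(d^a, 16) // ROL16
		c += d
		b = bits.RotateLeft32(b^c, 12) // PSLLL $0x0c / PSRLL $0x14 / PXOR
		a += b
		d = bits.RotateLeft32(d^a, 8) // ROL8
		c += d
		b = bits.RotateLeft32(b^c, 7) // PSLLL $0x07 / PSRLL $0x19 / PXOR
		return a, b, c, d
	}

	// polyStep is one Poly1305 block step, matching each expanded multiply
	// run above: h = (h + m + 2^128) * r mod 2^130-5, with the accumulator
	// h kept as three 64-bit limbs (h[2] holds only the top few bits).
	func polyStep(h *[3]uint64, r, m [2]uint64) {
		// polyAdd: ADDQ (DI), R10; ADCQ 8(DI), R11; ADCQ $0x01, R12
		var c uint64
		h[0], c = bits.Add64(h[0], m[0], 0)
		h[1], c = bits.Add64(h[1], m[1], c)
		h[2] += c + 1
		// polyMul, multiply: h[2] is tiny after reduction, so a plain
		// IMULQ suffices for its partial products.
		h2r0 := h[2] * r[0]
		h2r1 := h[2] * r[1]
		hi00, t0 := bits.Mul64(h[0], r[0])
		hi10, lo10 := bits.Mul64(h[1], r[0])
		hi01, lo01 := bits.Mul64(h[0], r[1])
		hi11, lo11 := bits.Mul64(h[1], r[1])
		t1, cc := bits.Add64(hi00, lo10, 0)
		t2, _ := bits.Add64(hi10, h2r0, cc)
		t1, cc = bits.Add64(t1, lo01, 0)
		t2, cc = bits.Add64(t2, hi01, cc)
		t3 := h2r1 + cc
		t2, cc = bits.Add64(t2, lo11, 0)
		t3 += hi11 + cc
		// polyMul, reduce: 2^130 = 5 (mod p), and t3:(t2&^3) is a multiple
		// of four, so adding it once whole and once shifted right by two
		// folds exactly 5*(t>>130) back in - the ANDQ/SHRQ/ADDQ/ADCQ tail.
		h[0], h[1], h[2] = t0, t1, t2&3
		h[0], cc = bits.Add64(h[0], t2&^3, 0)
		h[1], cc = bits.Add64(h[1], t3, cc)
		h[2] += cc
		h[0], cc = bits.Add64(h[0], t2>>2|t3<<62, 0)
		h[1], cc = bits.Add64(h[1], t3>>2, cc)
		h[2] += cc
	}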
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVO tmpStore, D3
-
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- ADDQ $192, inp
- MOVQ $192, itr1
- SUBQ $192, inl
- MOVO A3, A1
- MOVO B3, B1
- MOVO C3, C1
- MOVO D3, D1
- CMPQ inl, $64
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVO 64(BP), X15
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ ADDQ $0xc0, SI
+ MOVQ $0x000000c0, CX
+ SUBQ $0xc0, BX
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
JBE sealSSE128SealHash
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
- LEAQ 64(inp), inp
- SUBQ $64, inl
- MOVQ $6, itr1
- MOVQ $4, itr2
- CMPQ inl, $192
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 192(DI)
+ MOVOU X13, 208(DI)
+ MOVOU X14, 224(DI)
+ MOVOU X15, 240(DI)
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ MOVQ $0x00000006, CX
+ MOVQ $0x00000004, R9
+ CMPQ BX, $0xc0
JG sealSSEMainLoop
-
- MOVQ inl, itr1
- TESTQ inl, inl
+ MOVQ BX, CX
+ TESTQ BX, BX
JE sealSSE128SealHash
- MOVQ $6, itr1
- CMPQ inl, $64
+ MOVQ $0x00000006, CX
+ CMPQ BX, $0x40
JBE sealSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE sealSSETail128
JMP sealSSETail192
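The branches above route whatever plaintext remains to a size-specialized tail; the removed comments described these as special optimizations for the last 64, 128 and 192 bytes. A sketch of the dispatch, using the old register alias inl for the remaining length:

	// sealSSETailDispatch mirrors the TESTQ/CMPQ chain above.
	func sealSSETailDispatch(inl int) {
		switch {
		case inl == 0:
			// sealSSE128SealHash: nothing left to encrypt, only
			// ciphertext left to hash
		case inl <= 64:
			// sealSSETail64: one more ChaCha20 block
		case inl <= 128:
			// sealSSETail128: two blocks
		default:
			// sealSSETail192: three blocks (inl <= 192 here)
		}
	}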
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
- // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A1
- MOVO state1Store, B1
- MOVO state2Store, C1
- MOVO ctr3Store, D1
- PADDL ·sseIncMask<>(SB), D1
- MOVO D1, ctr0Store
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
sealSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail64LoopB:
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Right; shiftC1Right; shiftD1Right
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- DECQ itr1
- JG sealSSETail64LoopA
-
- DECQ itr2
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSETail64LoopA
+ DECQ R9
JGE sealSSETail64LoopB
- PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B1
- PADDL state2Store, C1
- PADDL ctr0Store, D1
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X4
+ PADDL 48(BP), X7
+ PADDL 80(BP), X10
+ JMP sealSSE128Seal
- JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
- // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
sealSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail128LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
-
- DECQ itr1
- JG sealSSETail128LoopA
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ DECQ CX
+ JG sealSSETail128LoopA
+ DECQ R9
+ JGE sealSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVQ $0x00000040, CX
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ JMP sealSSE128SealHash
- DECQ itr2
- JGE sealSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
- MOVQ $64, itr1
- LEAQ 64(inp), inp
- SUBQ $64, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
- // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 112(BP)
sealSSETail192LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail192LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
-
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- DECQ itr1
- JG sealSSETail192LoopA
-
- DECQ itr2
- JGE sealSSETail192LoopB
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ CX
+ JG sealSSETail192LoopA
+ DECQ R9
+ JGE sealSSETail192LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ PADDL 112(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ JMP sealSSE128SealHash
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- MOVO A2, A1
- MOVO B2, B1
- MOVO C2, C1
- MOVO D2, D1
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
sealSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE sealSSE128InnerCipherLoop
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSE128InnerCipherLoop
	// X0|X3 (the old A0|B0) hold the Poly1305 32-byte key, the rest of the state can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore
- MOVOU B0, sStore
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
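The PAND against ·polyClampMask and the two MOVOU stores above derive and save the one-time Poly1305 key: the first 16 bytes of keystream become r (clamped per RFC 8439) at 0(BP), the next 16 become s at 16(BP). A sketch, assuming "encoding/binary" and names of my own:

	// polyKey splits the first 32 keystream bytes into the Poly1305 key.
	func polyKey(block0 *[32]byte) (r, s [2]uint64) {
		// PAND ·polyClampMask<>+0(SB), X0 - clamp r
		r[0] = binary.LittleEndian.Uint64(block0[0:8]) & 0x0ffffffc0fffffff
		r[1] = binary.LittleEndian.Uint64(block0[8:16]) & 0x0ffffffc0ffffffc
		// s is used as-is
		s[0] = binary.LittleEndian.Uint64(block0[16:24])
		s[1] = binary.LittleEndian.Uint64(block0[24:32])
		return
	}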
	// Hash AD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealSSE128SealHash:
- // itr1 holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealSSE128Seal
- polyAdd(0(oup))
- polyMul
-
- SUBQ $16, itr1
- ADDQ $16, oup
-
- JMP sealSSE128SealHash
+ CMPQ CX, $0x10
+ JB sealSSE128Seal
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealSSE128SealHash
sealSSE128Seal:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB sealSSETail
- SUBQ $16, inl
+ SUBQ $0x10, BX
	// Load for encryption
- MOVOU (inp), T0
- PXOR T0, A1
- MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
// Extract for hashing
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP sealSSE128Seal
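Each pass of sealSSE128Seal above XORs one 16-byte keystream block (X1) into the plaintext, stores the ciphertext, hashes that same ciphertext via the two MOVQ/PSRLDQ extractions, and then rotates the X registers so the next keystream block lands in X1. One iteration, sketched in Go; polyStep is from the earlier sketch and "encoding/binary" is assumed, with all slices at least 16 bytes long:

	// sealBlock16 is one 16-byte encrypt-then-hash step.
	func sealBlock16(h *[3]uint64, r [2]uint64, ks, in, out []byte) {
		for i := 0; i < 16; i++ { // MOVOU (SI), X12; PXOR X12, X1; MOVOU X1, (DI)
			out[i] = in[i] ^ ks[i]
		}
		m := [2]uint64{
			binary.LittleEndian.Uint64(out[0:8]),  // MOVQ X1, R13
			binary.LittleEndian.Uint64(out[8:16]), // PSRLDQ $0x08; MOVQ X1, R14
		}
		polyStep(h, r, m) // the expanded ADDQ/MULQ run
	}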
sealSSETail:
- TESTQ inl, inl
+ TESTQ BX, BX
JE sealSSEFinalize
	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVQ inl, itr1
- LEAQ -1(inp)(inl*1), inp
- XORQ t2, t2
- XORQ t3, t3
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVQ BX, CX
+ LEAQ -1(SI)(BX*1), SI
+ XORQ R15, R15
+ XORQ R8, R8
XORQ AX, AX
sealSSETailLoadLoop:
- SHLQ $8, t2, t3
- SHLQ $8, t2
- MOVB (inp), AX
- XORQ AX, t2
- LEAQ -1(inp), inp
- DECQ itr1
+ SHLQ $0x08, R15, R8
+ SHLQ $0x08, R15
+ MOVB (SI), AX
+ XORQ AX, R15
+ LEAQ -1(SI), SI
+ DECQ CX
JNE sealSSETailLoadLoop
- MOVQ t2, 0+tmpStore
- MOVQ t3, 8+tmpStore
- PXOR 0+tmpStore, A1
- MOVOU A1, (oup)
- MOVOU -16(t0)(itr2*1), T0
- PAND T0, A1
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- ADDQ inl, oup
+ MOVQ R15, 64(BP)
+ MOVQ R8, 72(BP)
+ PXOR 64(BP), X1
+ MOVOU X1, (DI)
+ MOVOU -16(R13)(R9*1), X12
+ PAND X12, X1
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ BX, DI
sealSSEFinalize:
// Hash in the buffer lengths
- ADDQ ad_len+80(FP), acc0
- ADCQ src_len+56(FP), acc1
- ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally store the tag at the end of the message
- MOVQ acc0, (0*8)(oup)
- MOVQ acc1, (1*8)(oup)
+ MOVQ R10, (DI)
+ MOVQ R11, 8(DI)
RET
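The closing sequence above finishes the tag: h is fully reduced modulo p = 2^130 - 5 by a trial subtraction whose result is kept only when no borrow occurs, then the s half of the key is added modulo 2^128 and the 16-byte tag is stored. A sketch; note the assembly uses CMOVQCS so the selection is constant-time, while the if below is for readability only:

	// finalTag reduces h mod p and adds s, as in the SUBQ/SBBQ/CMOVQCS
	// tail above.
	func finalTag(h [3]uint64, s [2]uint64) (t [2]uint64) {
		t0, b := bits.Sub64(h[0], 0xfffffffffffffffb, 0) // SUBQ $-5 (adds 5)
		t1, b := bits.Sub64(h[1], 0xffffffffffffffff, b) // SBBQ $-1
		_, b = bits.Sub64(h[2], 3, b)                    // SBBQ $0x03
		if b == 0 { // no borrow: h >= p, keep the reduced value
			h[0], h[1] = t0, t1
		}
		var c uint64
		t[0], c = bits.Add64(h[0], s[0], 0) // ADDQ 16(BP), R10
		t[1], _ = bits.Add64(h[1], s[1], c) // ADCQ 24(BP), R11
		return
	}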
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
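The three six-byte BYTE runs above are hand-encoded VBROADCASTI128 loads, presumably because the assembler had no mnemonic form with a memory source; the comments on the removed lines preserve the decoding:

	c4 42 7d 5a 70 10  ->  VBROADCASTI128 16(R8), Y14  (key, low half)
	c4 42 7d 5a 60 20  ->  VBROADCASTI128 32(R8), Y12  (key, high half)
	c4 c2 7d 5a 60 30  ->  VBROADCASTI128 48(R8), Y4   (counter and nonce)

Each load duplicates one 128-bit state row into both lanes of a Y register, so a single register carries that row for two ChaCha20 blocks; the VPADDD of ·avx2InitMask then gives the two lanes distinct counters.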
	// Special optimizations for very short buffers
- CMPQ inl, $192
- JBE seal192AVX2 // 33% faster
- CMPQ inl, $320
- JBE seal320AVX2 // 17% faster
+ CMPQ BX, $0x000000c0
+ JBE seal192AVX2
+ CMPQ BX, $0x00000140
+ JBE seal320AVX2
	// For the general path, prepare the key first - as a byproduct we get 64 bytes of cipher stream
- VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
- VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA Y12, 64(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, 96(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y1, 128(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, R9
sealAVX2IntroLoop:
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr2
- JNE sealAVX2IntroLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
- VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
- VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
- VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ R9
+ JNE sealAVX2IntroLoop
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPERM2I128 $0x02, Y0, Y14, Y4
+ VPERM2I128 $0x13, Y0, Y14, Y0
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), DD0, DD0
- VMOVDQA DD0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, (BP)
// Hash AD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
// Can store at least 320 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), CC0, CC0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU CC0, (1*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
- VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
- VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
- MOVQ $320, itr1
- SUBQ $320, inl
- LEAQ 320(inp), inp
-
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
- CMPQ inl, $128
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y12, Y12
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y12, 32(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 64(SI), Y0, Y0
+ VPXOR 96(SI), Y14, Y14
+ VPXOR 128(SI), Y12, Y12
+ VPXOR 160(SI), Y4, Y4
+ VMOVDQU Y0, 64(DI)
+ VMOVDQU Y14, 96(DI)
+ VMOVDQU Y12, 128(DI)
+ VMOVDQU Y4, 160(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 192(SI), Y0, Y0
+ VPXOR 224(SI), Y14, Y14
+ VPXOR 256(SI), Y12, Y12
+ VPXOR 288(SI), Y4, Y4
+ VMOVDQU Y0, 192(DI)
+ VMOVDQU Y14, 224(DI)
+ VMOVDQU Y12, 256(DI)
+ VMOVDQU Y4, 288(DI)
+ MOVQ $0x00000140, CX
+ SUBQ $0x00000140, BX
+ LEAQ 320(SI), SI
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, Y15, Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, Y15, Y3, Y4
+ CMPQ BX, $0x80
JBE sealAVX2SealHash
-
- VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
- VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVQ $8, itr1
- MOVQ $2, itr2
-
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- CMPQ inl, $512
- JBE sealAVX2Tail512
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VPXOR 64(SI), Y12, Y12
+ VPXOR 96(SI), Y4, Y4
+ VMOVDQU Y0, 320(DI)
+ VMOVDQU Y14, 352(DI)
+ VMOVDQU Y12, 384(DI)
+ VMOVDQU Y4, 416(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVQ $0x00000008, CX
+ MOVQ $0x00000002, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ CMPQ BX, $0x00000200
+ JBE sealAVX2Tail512
	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
-
- SUBQ $16, oup // Adjust the pointer
- MOVQ $9, itr1
- JMP sealAVX2InternalLoopStart
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ SUBQ $0x10, DI
+ MOVQ $0x00000009, CX
+ JMP sealAVX2InternalLoopStart
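The SUBQ $0x10, DI together with the nine in CX (rather than the main loop's ten) appears to re-align Poly1305 with the 448 bytes of ciphertext written so far, since a full pass of the loop below hashes 512 bytes: the first pass runs one fewer iteration and enters mid-round at sealAVX2InternalLoopStart, with the input pointer pulled back one 16-byte block. The removed source annotated the SUBQ simply as "Adjust the pointer".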
sealAVX2MainLoop:
- // Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr1
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, CX
sealAVX2InternalLoop:
- polyAdd(0*8(oup))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
sealAVX2InternalLoopStart:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(oup))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(oup))
- LEAQ (6*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr1
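+	// Finish the column round (rol 8 and rol 7), then run the full diagonal
+	// round; each iteration hashes 48 bytes of already-written output.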
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(DI), R10
+ ADCQ 40(DI), R11
+ ADCQ $0x01, R12
+ LEAQ 48(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
JNE sealAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
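+	// Add the original state back in to finalize the four key-stream block pairs.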
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
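+	// Transpose the register lanes back into sequential 32-byte rows, XOR with
+	// the plaintext, and store the ciphertext.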
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
// and here
- polyAdd(-2*8(oup))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- SUBQ $(32*16), inl
- CMPQ inl, $512
+ ADDQ -16(DI), R10
+ ADCQ -8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ SUBQ $0x00000200, BX
+ CMPQ BX, $0x00000200
JG sealAVX2MainLoop
// Tail can only hash 480 bytes
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ 32(oup), oup
-
- MOVQ $10, itr1
- MOVQ $0, itr2
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- JMP sealAVX2Tail512
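+	// Hash the last 32 bytes of the previous block, then pick a tail routine
+	// based on how much input remains.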
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ MOVQ $0x0000000a, CX
+ MOVQ $0x00000000, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ JMP sealAVX2Tail512
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
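+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes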
seal192AVX2:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
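+	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks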
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
sealAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
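+	// One ChaCha20 double round over both block pairs; no hashing yet, since the
+	// Poly1305 key is derived from this very key stream.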
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE sealAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
sealAVX2ShortSeal:
// Hash aad
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealAVX2SealHash:
	// CX (itr1 in the old macros) holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealAVX2ShortSealLoop
- polyAdd(0(oup))
- polyMul
- SUBQ $16, itr1
- ADDQ $16, oup
- JMP sealAVX2SealHash
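+	// Absorb any full 16-byte blocks that are still pending, using the
+	// single-operand MULQ form of the Poly1305 multiply.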
+ CMPQ CX, $0x10
+ JB sealAVX2ShortSealLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealAVX2SealHash
sealAVX2ShortSealLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB sealAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for encryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
// Now can hash
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (1*32)(oup), oup
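+	// Hash the 32 bytes just written as two Poly1305 blocks.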
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP sealAVX2ShortSealLoop
sealAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB sealAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for encryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
// Hash
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
sealAVX2ShortDone:
VZEROUPPER
JMP sealSSETail
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
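+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes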
seal320AVX2:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
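+	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks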
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
sealAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
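+	// One double round over all three block pairs.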
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE sealAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP sealAVX2ShortSeal
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
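+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext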
sealAVX2Tail128:
-	// Need to encrypt up to 128 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0
- VMOVDQA state1StoreAVX2, BB0
- VMOVDQA state2StoreAVX2, CC0
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VMOVDQA DD0, DD1
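+	// Need to encrypt up to 128 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed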
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, Y1
sealAVX2Tail128LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail128LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $4, DD0, DD0, DD0
- DECQ itr1
- JG sealAVX2Tail128LoopA
- DECQ itr2
- JGE sealAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA1
- VPADDD state1StoreAVX2, BB0, BB1
- VPADDD state2StoreAVX2, CC0, CC1
- VPADDD DD1, DD0, DD1
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
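+	// One double round per iteration, hashing 32 bytes of output alongside.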
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ CX
+ JG sealAVX2Tail128LoopA
+ DECQ R9
+ JGE sealAVX2Tail128LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
+ VPADDD 32(BP), Y14, Y9
+ VPADDD 64(BP), Y12, Y13
+ VPADDD Y1, Y4, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2ShortSealLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
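+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext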
sealAVX2Tail256:
-	// Need to encrypt up to 256 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
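+	// Need to encrypt up to 256 bytes - prepare two block pairs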
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
sealAVX2Tail256LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr1
- JG sealAVX2Tail256LoopA
- DECQ itr2
- JGE sealAVX2Tail256LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
-
- JMP sealAVX2SealHash
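+	// Same round/hash interleave as the 128-byte tail, over two block pairs.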
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ CX
+ JG sealAVX2Tail256LoopA
+ DECQ R9
+ JGE sealAVX2Tail256LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ JMP sealAVX2SealHash
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
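+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext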
sealAVX2Tail384:
-	// Need to encrypt up to 384 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
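+	// Need to encrypt up to 384 bytes - prepare three block pairs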
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
+ VMOVDQA Y2, Y15
sealAVX2Tail384LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail384LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr1
- JG sealAVX2Tail384LoopA
- DECQ itr2
- JGE sealAVX2Tail384LoopB
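+	// Same interleave again, now over three block pairs.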
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ CX
+ JG sealAVX2Tail384LoopA
+ DECQ R9
+ JGE sealAVX2Tail384LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPADDD Y15, Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ MOVQ $0x00000100, CX
+ LEAQ 256(SI), SI
+ SUBQ $0x00000100, BX
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ JMP sealAVX2SealHash
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0
- VPERM2I128 $0x02, CC1, DD1, TT1
- VPERM2I128 $0x13, AA1, BB1, TT2
- VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- MOVQ $256, itr1
- LEAQ 256(inp), inp
- SUBQ $256, inl
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
-
- JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
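+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext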
sealAVX2Tail512:
-	// Need to encrypt up to 512 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
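+	// Need to encrypt up to 512 bytes - prepare four block pairs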
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
sealAVX2Tail512LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail512LoopB:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(oup))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
- DECQ itr1
- JG sealAVX2Tail512LoopA
- DECQ itr2
- JGE sealAVX2Tail512LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3
- VPXOR (0*32)(inp), CC3, CC3
- VMOVDQU CC3, (0*32)(oup)
- VPERM2I128 $0x02, CC0, DD0, CC3
- VPXOR (1*32)(inp), CC3, CC3
- VMOVDQU CC3, (1*32)(oup)
- VPERM2I128 $0x13, AA0, BB0, CC3
- VPXOR (2*32)(inp), CC3, CC3
- VMOVDQU CC3, (2*32)(oup)
- VPERM2I128 $0x13, CC0, DD0, CC3
- VPXOR (3*32)(inp), CC3, CC3
- VMOVDQU CC3, (3*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
- MOVQ $384, itr1
- LEAQ 384(inp), inp
- SUBQ $384, inl
- VPERM2I128 $0x02, AA3, BB3, AA0
- VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
- VPERM2I128 $0x13, AA3, BB3, CC0
- VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
- JMP sealAVX2SealHash
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
+ JG sealAVX2Tail512LoopA
+ DECQ R9
+ JGE sealAVX2Tail512LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPXOR (SI), Y15, Y15
+ VMOVDQU Y15, (DI)
+ VPERM2I128 $0x02, Y12, Y4, Y15
+ VPXOR 32(SI), Y15, Y15
+ VMOVDQU Y15, 32(DI)
+ VPERM2I128 $0x13, Y0, Y14, Y15
+ VPXOR 64(SI), Y15, Y15
+ VMOVDQU Y15, 64(DI)
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ MOVQ $0x00000180, CX
+ LEAQ 384(SI), SI
+ SUBQ $0x00000180, BX
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ JMP sealAVX2SealHash
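
Note on the hunk above: it is avo-generated output replacing the
hand-written macro assembly, but the dataflow is unchanged. Each
VPADDD/VPXOR/VPSHUFB and VPSLLD/VPSRLD/VPXOR cluster is one step of
the ChaCha20 quarter round applied to four 256-bit lanes at once,
interleaved with Poly1305 limb arithmetic in R10-R15. For
orientation, a minimal scalar sketch of the quarter round
(illustrative only, not part of the patch):

    package main

    import (
        "fmt"
        "math/bits"
    )

    // quarterRound mirrors one column of the vector code: the rotation
    // amounts 16, 12, 8 and 7 correspond to the rol16/rol8 shuffle
    // tables and to the $0x0c/$0x14 and $0x07/$0x19 shift pairs.
    func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
        a += b
        d = bits.RotateLeft32(d^a, 16)
        c += d
        b = bits.RotateLeft32(b^c, 12)
        a += b
        d = bits.RotateLeft32(d^a, 8)
        c += d
        b = bits.RotateLeft32(b^c, 7)
        return a, b, c, d
    }

    func main() {
        // RFC 8439 test vector; expect ea2a92f4 cb1cf8ce 4581472e 5881c4bb.
        a, b, c, d := quarterRound(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567)
        fmt.Printf("%x %x %x %x\n", a, b, c, d)
    }
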
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
index 333da285b3..bd896bdc76 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
package poly1305
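
With !ppc64 added to the constraint, the pure-Go fallback now steps
aside on big-endian ppc64 as well; the renamed sum_ppc64x.go/.s
below carry the complementary tag gc && !purego && (ppc64 || ppc64le),
so exactly one of the two update paths is compiled for any given
target. One way to confirm which files a target selects (run from an
x/crypto checkout; GoFiles and SFiles are standard go list fields):

    GOOS=linux GOARCH=ppc64 go list -f '{{.GoFiles}} {{.SFiles}}' \
        golang.org/x/crypto/internal/poly1305
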
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index e0d3c64756..133757384b 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
//go:build gc && !purego
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
- ADDQ 0(msg), h0; \
- ADCQ 8(msg), h1; \
- ADCQ $1, h2; \
- LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
- MOVQ r0, AX; \
- MULQ h0; \
- MOVQ AX, t0; \
- MOVQ DX, t1; \
- MOVQ r0, AX; \
- MULQ h1; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ r0, t2; \
- IMULQ h2, t2; \
- ADDQ DX, t2; \
- \
- MOVQ r1, AX; \
- MULQ h0; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ DX, h0; \
- MOVQ r1, t3; \
- IMULQ h2, t3; \
- MOVQ r1, AX; \
- MULQ h1; \
- ADDQ AX, t2; \
- ADCQ DX, t3; \
- ADDQ h0, t2; \
- ADCQ $0, t3; \
- \
- MOVQ t0, h0; \
- MOVQ t1, h1; \
- MOVQ t2, h2; \
- ANDQ $3, h2; \
- MOVQ t2, t0; \
- ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
- ADDQ t0, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2; \
- SHRQ $2, t3, t2; \
- SHRQ $2, t3; \
- ADDQ t2, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15
-
- MOVQ 0(DI), R8 // h0
- MOVQ 8(DI), R9 // h1
- MOVQ 16(DI), R10 // h2
- MOVQ 24(DI), R11 // r0
- MOVQ 32(DI), R12 // r1
-
- CMPQ R15, $16
+ MOVQ (DI), R8
+ MOVQ 8(DI), R9
+ MOVQ 16(DI), R10
+ MOVQ 24(DI), R11
+ MOVQ 32(DI), R12
+ CMPQ R15, $0x10
JB bytes_between_0_and_15
loop:
- POLY1305_ADD(SI, R8, R9, R10)
+ ADDQ (SI), R8
+ ADCQ 8(SI), R9
+ ADCQ $0x01, R10
+ LEAQ 16(SI), SI
multiply:
- POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
- SUBQ $16, R15
- CMPQ R15, $16
- JAE loop
+ MOVQ R11, AX
+ MULQ R8
+ MOVQ AX, BX
+ MOVQ DX, CX
+ MOVQ R11, AX
+ MULQ R9
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ R11, R13
+ IMULQ R10, R13
+ ADDQ DX, R13
+ MOVQ R12, AX
+ MULQ R8
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+ MOVQ R12, R14
+ IMULQ R10, R14
+ MOVQ R12, AX
+ MULQ R9
+ ADDQ AX, R13
+ ADCQ DX, R14
+ ADDQ R8, R13
+ ADCQ $0x00, R14
+ MOVQ BX, R8
+ MOVQ CX, R9
+ MOVQ R13, R10
+ ANDQ $0x03, R10
+ MOVQ R13, BX
+ ANDQ $-4, BX
+ ADDQ BX, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SHRQ $0x02, R14, R13
+ SHRQ $0x02, R14
+ ADDQ R13, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SUBQ $0x10, R15
+ CMPQ R15, $0x10
+ JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
- MOVQ $1, BX
+ MOVQ $0x00000001, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
- SHLQ $8, BX, CX
- SHLQ $8, BX
+ SHLQ $0x08, BX, CX
+ SHLQ $0x08, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
-
ADDQ BX, R8
ADCQ CX, R9
- ADCQ $0, R10
- MOVQ $16, R15
+ ADCQ $0x00, R10
+ MOVQ $0x00000010, R15
JMP multiply
done:
- MOVQ R8, 0(DI)
+ MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET
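
The generated body above inlines what POLY1305_ADD and POLY1305_MUL
spelled as macros; the arithmetic is the standard per-block step on a
130-bit accumulator h held in R8/R9/R10:
h = ((h + m + 2^128) * r) mod (2^130 - 5). A reference restatement
with math/big (the r and m values are illustrative stand-ins; a real
r is clamped first):

    package main

    import (
        "fmt"
        "math/big"
    )

    func main() {
        one := big.NewInt(1)
        // The Poly1305 prime 2^130 - 5; the SHRQ $0x02 pair in the
        // assembly is the cheap way to fold bits >= 2^130 back in.
        p := new(big.Int).Sub(new(big.Int).Lsh(one, 130), big.NewInt(5))

        h := big.NewInt(0)                              // accumulator
        r := new(big.Int).SetUint64(0x0806040203020100) // stand-in clamped r limb
        m := new(big.Int).SetUint64(0x1112131415161718) // stand-in message limb

        // One block: the ADDQ/ADCQ run accumulates m plus the 2^128
        // pad bit (ADCQ $0x01), then MULQ/IMULQ multiply by r.
        h.Add(h, m)
        h.Add(h, new(big.Int).Lsh(one, 128))
        h.Mul(h, r).Mod(h, p)

        fmt.Printf("h = %#x\n", h)
    }
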
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
index 4aec4874b5..1a1679aaad 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
package poly1305
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
index b3c1699bff..6899a1dabc 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
@@ -2,15 +2,25 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
#include "textflag.h"
// This was ported from the amd64 implementation.
+#ifdef GOARCH_ppc64le
+#define LE_MOVD MOVD
+#define LE_MOVWZ MOVWZ
+#define LE_MOVHZ MOVHZ
+#else
+#define LE_MOVD MOVDBR
+#define LE_MOVWZ MOVWBR
+#define LE_MOVHZ MOVHBR
+#endif
+
#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
- MOVD (msg), t0; \
- MOVD 8(msg), t1; \
+ LE_MOVD (msg)( R0), t0; \
+ LE_MOVD (msg)(R24), t1; \
MOVD $1, t2; \
ADDC t0, h0, h0; \
ADDE t1, h1, h1; \
@@ -50,10 +60,6 @@
ADDE t3, h1, h1; \
ADDZE h2
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
// func update(state *[7]uint64, msg []byte)
TEXT ·update(SB), $0-32
MOVD state+0(FP), R3
@@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32
MOVD 24(R3), R11 // r0
MOVD 32(R3), R12 // r1
+ MOVD $8, R24
+
CMP R5, $16
BLT bytes_between_0_and_15
@@ -94,7 +102,7 @@ flush_buffer:
// Greater than 8 -- load the rightmost remaining bytes in msg
// and put into R17 (h1)
- MOVD (R4)(R21), R17
+ LE_MOVD (R4)(R21), R17
MOVD $16, R22
// Find the offset to those bytes
@@ -118,7 +126,7 @@ just1:
BLT less8
// Exactly 8
- MOVD (R4), R16
+ LE_MOVD (R4), R16
CMP R17, $0
@@ -133,7 +141,7 @@ less8:
MOVD $0, R22 // shift count
CMP R5, $4
BLT less4
- MOVWZ (R4), R16
+ LE_MOVWZ (R4), R16
ADD $4, R4
ADD $-4, R5
MOVD $32, R22
@@ -141,7 +149,7 @@ less8:
less4:
CMP R5, $2
BLT less2
- MOVHZ (R4), R21
+ LE_MOVHZ (R4), R21
SLD R22, R21, R21
OR R16, R21, R16
ADD $16, R22
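
The LE_MOV* macros added above pick plain loads on ppc64le and the
byte-reversing MOVDBR/MOVWBR/MOVHBR forms on big-endian ppc64,
because Poly1305 consumes the message as little-endian limbs on
every host. The portable equivalent in Go, for comparison:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        // LE_MOVD gets this effect with MOVD on ppc64le and MOVDBR
        // on ppc64: the limb value is the same either way.
        msg := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}
        fmt.Printf("%#x\n", binary.LittleEndian.Uint64(msg)) // 0x0807060504030201
    }
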
diff --git a/src/vendor/golang.org/x/net/route/address.go b/src/vendor/golang.org/x/net/route/address.go
index 5443d67223..bae63003f0 100644
--- a/src/vendor/golang.org/x/net/route/address.go
+++ b/src/vendor/golang.org/x/net/route/address.go
@@ -170,20 +170,37 @@ func (a *Inet6Addr) marshal(b []byte) (int, error) {
// parseInetAddr parses b as an internet address for IPv4 or IPv6.
func parseInetAddr(af int, b []byte) (Addr, error) {
+ const (
+ off4 = 4 // offset of in_addr
+ off6 = 8 // offset of in6_addr
+ )
switch af {
case syscall.AF_INET:
- if len(b) < sizeofSockaddrInet {
+ if len(b) < (off4+1) || len(b) < int(b[0]) {
return nil, errInvalidAddr
}
+ sockAddrLen := int(b[0])
a := &Inet4Addr{}
- copy(a.IP[:], b[4:8])
+ n := off4 + 4
+ if sockAddrLen < n {
+ n = sockAddrLen
+ }
+ copy(a.IP[:], b[off4:n])
return a, nil
case syscall.AF_INET6:
- if len(b) < sizeofSockaddrInet6 {
+ if len(b) < (off6+1) || len(b) < int(b[0]) {
return nil, errInvalidAddr
}
- a := &Inet6Addr{ZoneID: int(nativeEndian.Uint32(b[24:28]))}
- copy(a.IP[:], b[8:24])
+ sockAddrLen := int(b[0])
+ n := off6 + 16
+ if sockAddrLen < n {
+ n = sockAddrLen
+ }
+ a := &Inet6Addr{}
+ if sockAddrLen == sizeofSockaddrInet6 {
+ a.ZoneID = int(nativeEndian.Uint32(b[24:28]))
+ }
+ copy(a.IP[:], b[off6:n])
if a.IP[0] == 0xfe && a.IP[1]&0xc0 == 0x80 || a.IP[0] == 0xff && (a.IP[1]&0x0f == 0x01 || a.IP[1]&0x0f == 0x02) {
// KAME based IPv6 protocol stack usually
// embeds the interface index in the
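
The rewritten parseInetAddr trusts the sockaddr's own length byte
(sa_len on the BSDs) rather than requiring a full sizeofSockaddrInet
or sizeofSockaddrInet6, so truncated addresses in routing messages
parse instead of failing, and the IPv6 zone ID is read only when the
kernel supplied a complete sockaddr_in6. A minimal sketch of the same
bounds pattern for the IPv4 arm (hypothetical helper, not the patched
function; the extra n < off4 guard is added here to keep the slice
bounds valid for degenerate sa_len values):

    package main

    import (
        "errors"
        "fmt"
    )

    var errInvalidAddr = errors.New("invalid address")

    // parseIPv4 copies at most the bytes the kernel provided: b[0] is
    // sa_len, the in_addr lives at offset 4, and whatever a short
    // sockaddr omits stays zero.
    func parseIPv4(b []byte) ([4]byte, error) {
        const off4 = 4
        var ip [4]byte
        if len(b) < off4+1 || len(b) < int(b[0]) {
            return ip, errInvalidAddr
        }
        n := off4 + 4
        if l := int(b[0]); l < n {
            n = l
        }
        if n < off4 {
            n = off4 // degenerate sa_len: nothing to copy
        }
        copy(ip[:], b[off4:n])
        return ip, nil
    }

    func main() {
        ip, err := parseIPv4([]byte{8, 2, 0, 0, 192, 0, 2, 1}) // sa_len = 8
        fmt.Println(ip, err)                                   // [192 0 2 1] <nil>
    }
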
diff --git a/src/vendor/modules.txt b/src/vendor/modules.txt
index 2ef127ad5a..164cdd09c9 100644
--- a/src/vendor/modules.txt
+++ b/src/vendor/modules.txt
@@ -1,4 +1,4 @@
-# golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0
+# golang.org/x/crypto v0.29.0
## explicit; go 1.20
golang.org/x/crypto/chacha20
golang.org/x/crypto/chacha20poly1305
@@ -6,7 +6,7 @@ golang.org/x/crypto/cryptobyte
golang.org/x/crypto/cryptobyte/asn1
golang.org/x/crypto/internal/alias
golang.org/x/crypto/internal/poly1305
-# golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd
+# golang.org/x/net v0.31.0
## explicit; go 1.18
golang.org/x/net/dns/dnsmessage
golang.org/x/net/http/httpguts
@@ -19,7 +19,7 @@ golang.org/x/net/route
# golang.org/x/sys v0.27.0
## explicit; go 1.18
golang.org/x/sys/cpu
-# golang.org/x/text v0.19.0
+# golang.org/x/text v0.20.0
## explicit; go 1.18
golang.org/x/text/secure/bidirule
golang.org/x/text/transform