| Age | Commit message (Collapse) | Author |
|
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
BytesCompare/1 7.238n ± 25% 5.204n ± 0% -28.10% (p=0.001 n=10)
BytesCompare/2 7.242n ± 6% 5.204n ± 0% -28.14% (p=0.000 n=10)
BytesCompare/4 7.229n ± 5% 4.403n ± 0% -39.10% (p=0.000 n=10)
BytesCompare/8 7.077n ± 36% 4.403n ± 0% -37.78% (p=0.000 n=10)
BytesCompare/16 8.373n ± 6% 6.004n ± 0% -28.30% (p=0.000 n=10)
BytesCompare/32 8.040n ± 3% 4.803n ± 0% -40.26% (p=0.000 n=10)
BytesCompare/64 8.434n ± 24% 10.410n ± 0% +23.42% (p=0.014 n=10)
BytesCompare/128 11.530n ± 23% 5.604n ± 0% -51.40% (p=0.000 n=10)
BytesCompare/256 14.180n ± 0% 7.606n ± 0% -46.36% (p=0.000 n=10)
BytesCompare/512 26.83n ± 0% 10.81n ± 0% -59.71% (p=0.000 n=10)
BytesCompare/1024 52.60n ± 0% 17.21n ± 0% -67.28% (p=0.000 n=10)
BytesCompare/2048 103.70n ± 0% 30.02n ± 0% -71.05% (p=0.000 n=10)
geomean 13.49n 7.607n -43.63%
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
CompareBytesEqual 5.603n ± 0% 5.604n ± 0% ~ (p=0.191 n=10)
CompareBytesToNil 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=10)
CompareBytesEmpty 2.802n ± 0% 2.802n ± 0% ~ (p=1.000 n=10)
CompareBytesIdentical 3.202n ± 0% 2.538n ± 1% -20.72% (p=0.000 n=10)
CompareBytesSameLength 8.805n ± 0% 4.803n ± 0% -45.45% (p=0.000 n=10)
CompareBytesDifferentLength 9.206n ± 0% 4.403n ± 0% -52.17% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6 82.03µ ± 0% 45.93µ ± 0% -44.01% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7 82.04µ ± 0% 45.93µ ± 0% -44.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0 78.76µ ± 0% 45.69µ ± 0% -41.98% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2 85.31µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6 85.31µ ± 0% 46.06µ ± 0% -46.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7 85.32µ ± 0% 52.32µ ± 7% -38.68% (p=0.000 n=10)
CompareBytesBig 78.76µ ± 0% 50.20µ ± 6% -36.26% (p=0.000 n=10)
CompareBytesBigIdentical 3.202n ± 0% 3.442n ± 24% ~ (p=0.462 n=10)
geomean 4.197µ 2.630µ -37.34%
Change-Id: I621145aef3e6a2c68e7127152f26ed047c6b2ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/671315
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
|
|
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3C5000 @ 2200.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
IndexByte/10 19.32n ± 0% 11.84n ± 0% -38.72% (p=0.000 n=10)
IndexByte/32 49.34n ± 0% 14.11n ± 0% -71.40% (p=0.000 n=10)
IndexByte/4K 5608.0n ± 0% 138.8n ± 0% -97.52% (p=0.000 n=10)
IndexByte/4M 3822.8µ ± 0% 119.4µ ± 0% -96.88% (p=0.000 n=10)
IndexByte/64M 61.826m ± 1% 3.812m ± 0% -93.83% (p=0.000 n=10)
geomean 16.61µ 1.602µ -90.35%
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
IndexByte/10 6.809n ± 0% 5.804n ± 0% -14.75% (p=0.000 n=10)
IndexByte/32 16.015n ± 0% 6.404n ± 0% -60.01% (p=0.000 n=10)
IndexByte/4K 1651.00n ± 0% 52.83n ± 0% -96.80% (p=0.000 n=10)
IndexByte/4M 1680.76µ ± 0% 91.10µ ± 0% -94.58% (p=0.000 n=10)
IndexByte/64M 26.878m ± 0% 2.010m ± 27% -92.52% (p=0.000 n=10)
geomean 6.054µ 815.0n -86.54%
Change-Id: Ib75b997249708f921c6717eba43543c6650bf376
Reviewed-on: https://go-review.googlesource.com/c/go/+/668055
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
|
|
Change-Id: I22eb4e7444e5fe5f6767cc960895f3c6e2fa13cc
Reviewed-on: https://go-review.googlesource.com/c/go/+/661615
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Carlos Amedee <carlos@golang.org>
Reviewed-by: Carlos Amedee <carlos@golang.org>
|
|
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
Equal/0 0.4012n ± 0% 0.4003n ± 0% -0.21% (p=0.000 n=10)
Equal/same/1 2.555n ± 1% 2.419n ± 0% -5.32% (p=0.000 n=10)
Equal/same/6 2.574n ± 1% 2.425n ± 1% -5.79% (p=0.000 n=10)
Equal/same/9 2.578n ± 0% 2.419n ± 1% -6.19% (p=0.000 n=10)
Equal/same/15 2.565n ± 1% 2.417n ± 0% -5.73% (p=0.000 n=10)
Equal/same/16 2.576n ± 1% 2.414n ± 0% -6.31% (p=0.000 n=10)
Equal/same/20 2.573n ± 1% 2.416n ± 0% -6.10% (p=0.000 n=10)
Equal/same/32 2.559n ± 0% 2.411n ± 0% -5.80% (p=0.000 n=10)
Equal/same/4K 2.579n ± 1% 2.410n ± 0% -6.53% (p=0.000 n=10)
Equal/same/4M 2.571n ± 0% 2.411n ± 0% -6.22% (p=0.000 n=10)
Equal/same/64M 2.568n ± 1% 2.413n ± 0% -6.05% (p=0.000 n=10)
Equal/1 5.215n ± 0% 6.404n ± 0% +22.80% (p=0.000 n=10)
Equal/6 11.630n ± 0% 6.404n ± 0% -44.94% (p=0.000 n=10)
Equal/9 15.240n ± 0% 6.404n ± 0% -57.98% (p=0.000 n=10)
Equal/15 22.925n ± 0% 6.404n ± 0% -72.07% (p=0.000 n=10)
Equal/16 24.070n ± 0% 5.203n ± 0% -78.38% (p=0.000 n=10)
Equal/20 28.880n ± 0% 6.404n ± 0% -77.83% (p=0.000 n=10)
Equal/32 43.320n ± 0% 6.404n ± 0% -85.22% (p=0.000 n=10)
Equal/4K 4938.50n ± 0% 55.43n ± 0% -98.88% (p=0.000 n=10)
Equal/4M 5048.8µ ± 0% 202.0µ ± 0% -96.00% (p=0.000 n=10)
Equal/64M 80.819m ± 0% 4.539m ± 0% -94.38% (p=0.000 n=10)
EqualBothUnaligned/64_0 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_1 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_4 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_7 79.830n ± 0% 4.803n ± 0% -93.98% (p=0.000 n=10)
EqualBothUnaligned/4096_0 4937.00n ± 0% 65.64n ± 0% -98.67% (p=0.000 n=10)
EqualBothUnaligned/4096_1 4937.00n ± 0% 78.85n ± 0% -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_4 4937.00n ± 0% 78.87n ± 0% -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_7 4937.00n ± 0% 78.87n ± 0% -98.40% (p=0.000 n=10)
EqualBothUnaligned/4194304_0 5049.2µ ± 0% 204.2µ ± 0% -95.96% (p=0.000 n=10)
EqualBothUnaligned/4194304_1 5049.2µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_4 5049.4µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_7 5049.2µ ± 0% 205.1µ ± 0% -95.94% (p=0.000 n=10)
EqualBothUnaligned/67108864_0 80.796m ± 0% 3.863m ± 0% -95.22% (p=0.000 n=10)
EqualBothUnaligned/67108864_1 80.801m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_4 80.799m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_7 80.781m ± 0% 3.706m ± 0% -95.41% (p=0.000 n=10)
geomean 1.040µ 149.6n -85.63%
Change-Id: Id4c2bc0ca758337dd9759df83750c761814be488
Reviewed-on: https://go-review.googlesource.com/c/go/+/667255
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
|
|
Introduce ABIInternal support for Index/IndexString/IndexByte/IndexByteString
goos: linux
goarch: arm64
pkg: bytes
│ base.txt │ new.txt │
│ B/s │ B/s vs base │
IndexByte/10 1.090Gi ± 0% 1.313Gi ± 0% +20.51% (p=0.000 n=10)
IndexByte/32 3.714Gi ± 0% 4.289Gi ± 0% +15.47% (p=0.000 n=10)
IndexByte/4K 22.92Gi ± 0% 23.01Gi ± 0% +0.37% (p=0.000 n=10)
IndexByte/4M 20.23Gi ± 0% 20.35Gi ± 0% +0.60% (p=0.000 n=10)
IndexByte/64M 23.82Gi ± 0% 23.81Gi ± 0% -0.01% (p=0.002 n=10)
IndexBytePortable/10 788.5Mi ± 0% 788.5Mi ± 0% ~ (p=0.722 n=10)
IndexBytePortable/32 1002.3Mi ± 0% 1002.3Mi ± 0% ~ (p=0.137 n=10)
IndexBytePortable/4K 1.111Gi ± 0% 1.111Gi ± 0% ~ (p=0.692 n=10)
IndexBytePortable/4M 1.116Gi ± 0% 1.116Gi ± 0% ~ (p=0.158 n=10)
IndexBytePortable/64M 1.116Gi ± 0% 1.116Gi ± 0% -0.01% (p=0.000 n=10)
IndexRune/10 352.1Mi ± 0% 445.0Mi ± 0% +26.38% (p=0.000 n=10)
IndexRune/32 1.101Gi ± 0% 1.391Gi ± 0% +26.43% (p=0.000 n=10)
IndexRune/4K 21.07Gi ± 0% 21.25Gi ± 0% +0.82% (p=0.000 n=10)
IndexRune/4M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.218 n=10)
IndexRune/64M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.271 n=10)
IndexRuneASCII/10 1.038Gi ± 0% 1.190Gi ± 1% +14.63% (p=0.000 n=10)
IndexRuneASCII/32 3.643Gi ± 2% 4.203Gi ± 0% +15.38% (p=0.000 n=10)
IndexRuneASCII/4K 22.90Gi ± 0% 22.98Gi ± 0% +0.34% (p=0.000 n=10)
IndexRuneASCII/4M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.108 n=10)
IndexRuneASCII/64M 23.82Gi ± 0% 23.81Gi ± 0% ~ (p=0.105 n=10)
IndexRuneUnicode/Latin/10 404.4Mi ± 0% 493.7Mi ± 0% +22.10% (p=0.000 n=10)
IndexRuneUnicode/Latin/32 1.261Gi ± 0% 1.543Gi ± 0% +22.31% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K 6.966Gi ± 0% 8.115Gi ± 0% +16.50% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M 6.599Gi ± 0% 7.576Gi ± 0% +14.80% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M 6.297Gi ± 0% 7.070Gi ± 2% +12.28% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10 385.9Mi ± 0% 440.1Mi ± 0% +14.03% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32 1.206Gi ± 0% 1.375Gi ± 0% +14.05% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K 2.468Gi ± 0% 2.921Gi ± 0% +18.37% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M 2.386Gi ± 0% 2.845Gi ± 0% +19.23% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/64M 2.280Gi ± 0% 2.717Gi ± 0% +19.14% (p=0.000 n=10)
IndexRuneUnicode/Han/10 307.1Mi ± 0% 331.5Mi ± 0% +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/32 982.2Mi ± 0% 1060.2Mi ± 0% +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/4K 4.986Gi ± 0% 5.957Gi ± 0% +19.48% (p=0.000 n=10)
IndexRuneUnicode/Han/4M 3.822Gi ± 0% 4.198Gi ± 0% +9.83% (p=0.000 n=10)
IndexRuneUnicode/Han/64M 3.765Gi ± 0% 4.140Gi ± 0% +9.96% (p=0.000 n=10)
Index/10 634.6Mi ± 0% 635.2Mi ± 0% +0.09% (p=0.000 n=10)
Index/32 375.3Mi ± 0% 385.1Mi ± 0% +2.63% (p=0.000 n=10)
Index/4K 754.8Mi ± 0% 755.2Mi ± 0% +0.04% (p=0.001 n=10)
Index/4M 746.5Mi ± 0% 746.3Mi ± 0% -0.03% (p=0.000 n=10)
Index/64M 746.5Mi ± 0% 746.3Mi ± 0% -0.03% (p=0.000 n=10)
IndexEasy/10 714.6Mi ± 0% 714.6Mi ± 0% +0.00% (p=0.001 n=10)
IndexEasy/32 1.221Gi ± 0% 1.524Gi ± 0% +24.81% (p=0.000 n=10)
IndexEasy/4K 21.06Gi ± 0% 21.47Gi ± 0% +1.91% (p=0.000 n=10)
IndexEasy/4M 20.23Gi ± 0% 20.24Gi ± 0% ~ (p=0.684 n=10)
IndexEasy/64M 13.07Gi ± 0% 12.58Gi ± 4% -3.75% (p=0.000 n=10)
IndexHard1 1.114Gi ± 0% 1.114Gi ± 0% ~ (p=0.193 n=10)
IndexHard2 1.111Gi ± 0% 1.112Gi ± 0% +0.04% (p=0.001 n=10)
IndexHard3 1.086Gi ± 0% 1.081Gi ± 0% -0.37% (p=0.000 n=10)
IndexHard4 607.9Mi ± 0% 607.9Mi ± 0% ~ (p=0.136 n=10)
geomean 2.536Gi 2.720Gi +7.26%
Change-Id: I1fc246783ebb215882d7144d05dbe2433dc66751
Reviewed-on: https://go-review.googlesource.com/c/go/+/662415
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
|
|
Introduce ABIInternal support for Count/CountString
Move <32 size block from function end to beginning as fastpath
goos: linux
goarch: arm64
pkg: strings
│ base.txt │ new.txt │
│ B/s │ B/s vs base │
CountByte/10 672.5Mi ± 0% 692.9Mi ± 0% +3.04% (p=0.000 n=10)
CountByte/32 3.592Gi ± 0% 3.970Gi ± 0% +10.53% (p=0.000 n=10)
CountByte/4096 16.63Gi ± 0% 16.73Gi ± 0% +0.64% (p=0.000 n=10)
CountByte/4194304 14.97Gi ± 2% 15.02Gi ± 1% ~ (p=0.190 n=10)
CountByte/67108864 12.50Gi ± 0% 12.50Gi ± 0% ~ (p=0.853 n=10)
geomean 5.931Gi 6.099Gi +2.83%
Change-Id: I5af1be2b117d9fb8d570739637499923de62251c
Reviewed-on: https://go-review.googlesource.com/c/go/+/662395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Commit-Queue: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
|
|
instructions
Provide a synthesized version of the MIN/MAX/MINU/MAXU instructions
if they're not natively available. This allows these instructions to
be used in assembly unconditionally.
Use MIN in internal/bytealg.compare.
Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64
Change-Id: I8a5a3a59f0a9205e136fc3d673b23eaf3ca469f8
Reviewed-on: https://go-review.googlesource.com/c/go/+/653295
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
Benchmark on Loongson 3A6000 and 3A5000:
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
CountSingle/10 13.210n ± 0% 9.984n ± 0% -24.42% (p=0.000 n=15)
CountSingle/32 31.970n ± 1% 7.205n ± 0% -77.46% (p=0.000 n=15)
CountSingle/4K 4039.0n ± 0% 108.7n ± 0% -97.31% (p=0.000 n=15)
CountSingle/4M 4158.9µ ± 0% 117.3µ ± 0% -97.18% (p=0.000 n=15)
CountSingle/64M 68.641m ± 0% 2.585m ± 1% -96.23% (p=0.000 n=15)
geomean 13.72µ 1.189µ -91.34%
| bench.old | bench.new |
| B/s | B/s vs base |
CountSingle/10 722.0Mi ± 0% 955.2Mi ± 0% +32.30% (p=0.000 n=15)
CountSingle/32 954.6Mi ± 1% 4235.4Mi ± 0% +343.68% (p=0.000 n=15)
CountSingle/4K 967.2Mi ± 0% 35947.6Mi ± 0% +3616.64% (p=0.000 n=15)
CountSingle/4M 961.8Mi ± 0% 34092.7Mi ± 0% +3444.71% (p=0.000 n=15)
CountSingle/64M 932.4Mi ± 0% 24757.2Mi ± 1% +2555.24% (p=0.000 n=15)
geomean 902.2Mi 10.17Gi +1054.77%
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
CountSingle/10 14.41n ± 0% 12.81n ± 0% -11.10% (p=0.000 n=15)
CountSingle/32 36.230n ± 0% 9.609n ± 0% -73.48% (p=0.000 n=15)
CountSingle/4K 4366.0n ± 0% 165.5n ± 0% -96.21% (p=0.000 n=15)
CountSingle/4M 4464.7µ ± 0% 325.2µ ± 0% -92.72% (p=0.000 n=15)
CountSingle/64M 75.627m ± 0% 8.307m ± 69% -89.02% (p=0.000 n=15)
geomean 15.04µ 2.229µ -85.18%
| bench.old | bench.new |
| B/s | B/s vs base |
CountSingle/10 661.8Mi ± 0% 744.4Mi ± 0% +12.49% (p=0.000 n=15)
CountSingle/32 842.4Mi ± 0% 3176.1Mi ± 0% +277.03% (p=0.000 n=15)
CountSingle/4K 894.7Mi ± 0% 23596.7Mi ± 0% +2537.34% (p=0.000 n=15)
CountSingle/4M 895.9Mi ± 0% 12299.7Mi ± 0% +1272.88% (p=0.000 n=15)
CountSingle/64M 846.3Mi ± 0% 7703.9Mi ± 41% +810.34% (p=0.000 n=15)
geomean 823.3Mi 5.424Gi +574.68%
Change-Id: Ie07592beac61bdb093470c524049ed494df4d703
Reviewed-on: https://go-review.googlesource.com/c/go/+/586055
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
|
|
Add a simple assembly implementation of Count/CountString for mips64x.
name old sec/op new sec/op vs base
CountSingle/10-4 31.16n ± 0% 41.69n ± 0% +33.79% (p=0.000 n=11)
CountSingle/32-4 69.58n ± 0% 59.61n ± 0% -14.33% (p=0.000 n=11)
CountSingle/4K-4 7.428µ ± 0% 5.153µ ± 0% -30.63% (p=0.000 n=11)
CountSingle/4M-4 7.634m ± 0% 5.300m ± 0% -30.58% (p=0.000 n=11)
CountSingle/64M-4 134.4m ± 0% 100.8m ± 3% -24.99% (p=0.000 n=11)
name old B/s new B/s vs base
CountSingle/10-4 306.1Mi ± 0% 228.8Mi ± 0% -25.25% (p=0.000 n=11)
CountSingle/32-4 438.6Mi ± 0% 512.0Mi ± 0% +16.74% (p=0.000 n=11)
CountSingle/4K-4 525.9Mi ± 0% 758.0Mi ± 0% +44.15% (p=0.000 n=11)
CountSingle/4M-4 523.9Mi ± 0% 754.7Mi ± 0% +44.05% (p=0.000 n=11)
CountSingle/64M-4 476.3Mi ± 0% 635.0Mi ± 0% +33.31% (p=0.000 n=11)
Change-Id: Id5ddbea0d080e2903156ef8dc86c030a8179115b
Reviewed-on: https://go-review.googlesource.com/c/go/+/650995
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
|
|
Now that riscv64 is only regabi, remove the entrypoint separation and
have runtime.memequal_varlen call runtime.memequal. Add a zero byte
length check and replace the equal and not equal exit paths with a
single exit path that conditions on length reaching zero.
Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64
Change-Id: Ida4e54378daa7fd423f759753eba04ce513a27cb
Reviewed-on: https://go-review.googlesource.com/c/go/+/648855
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
|
|
The existing implementations of IndexByte and IndexByteString for
riscv64 are very simplistic. They load and compare a single byte at
a time in a tight loop. It's possible to improve performance in the
general case by loading and checking 8 bytes at a time. This is
achieved using the 'Determine if a word has a byte equal to n' bit
hack from https://graphics.stanford.edu/~seander/bithacks.html.
We broadcast the byte we're looking for across a 64 bit register,
let v be the result of xoring that register with 8 bytes loaded
from the buffer and then use the formula,
(((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
which evaluates to true if any one of the bytes in v is 0, i.e.,
matches the byte we're looking for. We then just need to figure
out which byte out of the 8 it is to return the correct index.
This change generally improves performance when the byte we're
looking for is not in the first 24 bytes of the buffer and degrades
performance slightly when it is.
Some example benchmark results from the bytes and strings packages
are presented below. These were generated on a VisionFive2 running
Ubuntu 24.04.
Subset of bytes Index benchmarks
IndexByte/10 46.49n ± 0% 44.08n ± 0% -5.19% (p=0.000 n=10)
IndexByte/32 75.98n ± 0% 67.90n ± 0% -10.63% (p=0.000 n=10)
IndexByte/4K 5.512µ ± 0% 2.113µ ± 0% -61.67% (p=0.000 n=10)
IndexByte/4M 7.354m ± 0% 3.218m ± 0% -56.24% (p=0.000 n=10)
IndexByte/64M 90.15m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10)
IndexBytePortable/10 50.41n ± 0% 54.92n ± 1% +8.94% (p=0.000 n=10)
IndexBytePortable/32 111.9n ± 0% 115.5n ± 0% +3.22% (p=0.000 n=10)
IndexBytePortable/4K 10.99µ ± 0% 10.99µ ± 0% +0.04% (p=0.000 n=10)
IndexBytePortable/4M 11.24m ± 0% 11.24m ± 0% ~ (p=0.218 n=10)
IndexBytePortable/64M 179.8m ± 0% 179.8m ± 0% +0.01% (p=0.001 n=10)
IndexRune/10 104.2n ± 0% 104.4n ± 0% +0.19% (p=0.000 n=10)
IndexRune/32 133.7n ± 0% 139.3n ± 0% +4.23% (p=0.000 n=10)
IndexRune/4K 5.573µ ± 0% 2.184µ ± 0% -60.81% (p=0.000 n=10)
IndexRune/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10)
IndexRune/64M 90.19m ± 0% 33.87m ± 0% -62.45% (p=0.000 n=10)
IndexRuneASCII/10 50.42n ± 2% 47.14n ± 0% -6.52% (p=0.000 n=10)
IndexRuneASCII/32 79.64n ± 1% 70.39n ± 0% -11.61% (p=0.000 n=10)
IndexRuneASCII/4K 5.516µ ± 0% 2.115µ ± 0% -61.66% (p=0.000 n=10)
IndexRuneASCII/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10)
IndexRuneASCII/64M 90.16m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10)
IndexRuneUnicode/Latin/10 82.14n ± 0% 82.07n ± 0% -0.09% (p=0.000 n=10)
IndexRuneUnicode/Latin/32 111.6n ± 0% 117.1n ± 0% +4.93% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K 6.222µ ± 0% 3.429µ ± 0% -44.89% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M 8.189m ± 0% 4.706m ± 0% -42.53% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M 171.8m ± 2% 105.8m ± 0% -38.44% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10 89.69n ± 0% 89.67n ± 0% -0.02% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32 119.1n ± 0% 124.1n ± 0% +4.20% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K 8.002µ ± 0% 6.232µ ± 0% -22.12% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M 9.501m ± 0% 7.510m ± 0% -20.95% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/64M 186.5m ± 0% 150.3m ± 0% -19.41% (p=0.000 n=10)
IndexRuneUnicode/Han/10 117.8n ± 0% 118.1n ± 0% +0.25% (p=0.000 n=10)
IndexRuneUnicode/Han/32 151.5n ± 0% 154.0n ± 0% +1.65% (p=0.000 n=10)
IndexRuneUnicode/Han/4K 6.664µ ± 0% 4.125µ ± 0% -38.11% (p=0.000 n=10)
IndexRuneUnicode/Han/4M 8.526m ± 0% 5.502m ± 0% -35.46% (p=0.000 n=10)
IndexRuneUnicode/Han/64M 171.8m ± 1% 112.2m ± 0% -34.68% (p=0.000 n=10)
Index/10 199.3n ± 1% 199.4n ± 0% ~ (p=1.000 n=10)
Index/32 547.7n ± 0% 547.3n ± 0% -0.08% (p=0.001 n=10)
Index/4K 38.62µ ± 0% 38.62µ ± 0% -0.01% (p=0.023 n=10)
Index/4M 40.46m ± 0% 40.45m ± 0% ~ (p=0.105 n=10)
Index/64M 648.5m ± 0% 648.4m ± 0% ~ (p=1.000 n=10)
IndexEasy/10 70.25n ± 0% 70.92n ± 0% +0.95% (p=0.000 n=10)
IndexEasy/32 104.60n ± 0% 95.67n ± 0% -8.54% (p=0.000 n=10)
IndexEasy/4K 5.544µ ± 0% 2.142µ ± 0% -61.36% (p=0.000 n=10)
IndexEasy/4M 7.354m ± 0% 3.213m ± 0% -56.32% (p=0.000 n=10)
IndexEasy/64M 114.93m ± 2% 52.61m ± 0% -54.22% (p=0.000 n=10)
IndexHard1 10.09m ± 0% 10.09m ± 0% ~ (p=0.393 n=10)
IndexHard2 10.09m ± 0% 10.09m ± 0% ~ (p=0.481 n=10)
IndexHard3 10.09m ± 0% 10.09m ± 0% ~ (p=1.000 n=10)
IndexHard4 10.09m ± 0% 10.09m ± 0% ~ (p=0.739 n=10)
LastIndexHard1 10.71m ± 0% 10.71m ± 0% ~ (p=0.052 n=10)
LastIndexHard2 10.71m ± 0% 10.71m ± 0% ~ (p=0.218 n=10)
LastIndexHard3 10.71m ± 0% 10.71m ± 0% ~ (p=0.739 n=10)
IndexAnyASCII/1:1 30.13n ± 0% 30.79n ± 0% +2.19% (p=0.000 n=10)
IndexAnyASCII/1:2 31.49n ± 0% 32.16n ± 0% +2.13% (p=0.000 n=10)
IndexAnyASCII/1:4 34.16n ± 0% 34.82n ± 0% +1.93% (p=0.000 n=10)
IndexAnyASCII/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10)
IndexAnyASCII/1:16 50.20n ± 0% 50.87n ± 0% +1.33% (p=0.000 n=10)
IndexAnyASCII/1:32 81.04n ± 0% 50.29n ± 0% -37.94% (p=0.000 n=10)
IndexAnyASCII/1:64 119.80n ± 0% 66.94n ± 0% -44.13% (p=0.000 n=10)
IndexAnyASCII/16:1 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10)
IndexAnyASCII/16:2 268.2n ± 0% 268.2n ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/16:4 288.1n ± 0% 288.1n ± 0% ~ (p=1.000 n=10) ¹
IndexAnyASCII/16:8 328.3n ± 0% 328.2n ± 0% ~ (p=0.370 n=10)
IndexAnyASCII/16:16 413.4n ± 0% 413.4n ± 0% ~ (p=0.628 n=10)
IndexAnyASCII/16:32 574.0n ± 0% 573.9n ± 0% ~ (p=0.141 n=10)
IndexAnyASCII/16:64 895.1n ± 0% 895.1n ± 0% ~ (p=0.548 n=10)
IndexAnyASCII/256:1 381.4n ± 0% 175.4n ± 0% -53.99% (p=0.000 n=10)
IndexAnyASCII/256:2 2.998µ ± 0% 2.998µ ± 0% ~ (p=0.365 n=10)
IndexAnyASCII/256:4 3.018µ ± 0% 3.018µ ± 0% ~ (p=0.650 n=10)
IndexAnyASCII/256:8 3.058µ ± 0% 3.064µ ± 0% +0.20% (p=0.011 n=10)
IndexAnyASCII/256:16 3.143µ ± 0% 3.150µ ± 0% +0.21% (p=0.000 n=10)
IndexAnyASCII/256:32 3.303µ ± 0% 3.307µ ± 0% +0.12% (p=0.000 n=10)
IndexAnyASCII/256:64 3.625µ ± 0% 3.638µ ± 0% +0.36% (p=0.000 n=10)
IndexAnyUTF8/1:1 30.13n ± 0% 30.94n ± 0% +2.69% (p=0.000 n=10)
IndexAnyUTF8/1:2 31.49n ± 0% 32.30n ± 0% +2.59% (p=0.000 n=10)
IndexAnyUTF8/1:4 34.16n ± 0% 35.03n ± 0% +2.55% (p=0.000 n=10)
IndexAnyUTF8/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10)
IndexAnyUTF8/1:16 50.20n ± 0% 50.84n ± 0% +1.27% (p=0.000 n=10)
IndexAnyUTF8/1:32 81.02n ± 0% 61.55n ± 0% -24.03% (p=0.000 n=10)
IndexAnyUTF8/1:64 119.80n ± 0% 80.04n ± 0% -33.19% (p=0.000 n=10)
IndexAnyUTF8/16:1 489.0n ± 0% 489.0n ± 0% ~ (p=1.000 n=10)
IndexAnyUTF8/16:2 361.9n ± 0% 372.6n ± 0% +2.96% (p=0.000 n=10)
IndexAnyUTF8/16:4 404.7n ± 0% 415.4n ± 0% +2.64% (p=0.000 n=10)
IndexAnyUTF8/16:8 489.9n ± 0% 500.7n ± 0% +2.20% (p=0.000 n=10)
IndexAnyUTF8/16:16 661.2n ± 0% 671.9n ± 0% +1.62% (p=0.000 n=10)
IndexAnyUTF8/16:32 1004.0n ± 0% 881.6n ± 0% -12.19% (p=0.000 n=10)
IndexAnyUTF8/16:64 1.767µ ± 0% 1.129µ ± 0% -36.11% (p=0.000 n=10)
IndexAnyUTF8/256:1 7.072µ ± 0% 7.072µ ± 0% ~ (p=0.387 n=10)
IndexAnyUTF8/256:2 4.700µ ± 0% 4.872µ ± 0% +3.66% (p=0.000 n=10)
IndexAnyUTF8/256:4 5.386µ ± 0% 5.557µ ± 0% +3.18% (p=0.000 n=10)
IndexAnyUTF8/256:8 6.752µ ± 0% 6.923µ ± 0% +2.53% (p=0.000 n=10)
IndexAnyUTF8/256:16 9.493µ ± 0% 9.664µ ± 0% +1.80% (p=0.000 n=10)
IndexAnyUTF8/256:32 14.97µ ± 0% 12.93µ ± 0% -13.64% (p=0.000 n=10)
IndexAnyUTF8/256:64 27.15µ ± 0% 16.89µ ± 0% -37.80% (p=0.000 n=10)
LastIndexAnyASCII/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10)
LastIndexAnyASCII/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10)
LastIndexAnyASCII/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10)
LastIndexAnyASCII/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10)
LastIndexAnyASCII/1:16 50.85n ± 0% 51.51n ± 0% +1.30% (p=0.000 n=10)
LastIndexAnyASCII/1:32 84.03n ± 0% 50.85n ± 0% -39.49% (p=0.000 n=10)
LastIndexAnyASCII/1:64 121.50n ± 0% 68.16n ± 0% -43.90% (p=0.000 n=10)
LastIndexAnyASCII/16:1 249.7n ± 0% 249.7n ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/16:4 274.0n ± 0% 274.0n ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/16:8 314.1n ± 0% 314.1n ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/16:16 403.8n ± 0% 403.8n ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/16:32 564.4n ± 0% 564.4n ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/16:64 885.5n ± 0% 885.5n ± 0% ~ (p=0.474 n=10)
LastIndexAnyASCII/256:1 2.819µ ± 0% 2.819µ ± 0% ~ (p=0.211 n=10)
LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:4 2.843µ ± 0% 2.843µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:8 2.883µ ± 0% 2.883µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:16 2.973µ ± 0% 2.973µ ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/256:32 3.133µ ± 0% 3.133µ ± 0% ~ (p=0.628 n=10)
LastIndexAnyASCII/256:64 3.454µ ± 0% 3.454µ ± 0% ~ (p=1.000 n=10)
LastIndexAnyUTF8/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10)
LastIndexAnyUTF8/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10)
LastIndexAnyUTF8/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10)
LastIndexAnyUTF8/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10)
LastIndexAnyUTF8/1:16 50.84n ± 0% 51.52n ± 0% +1.33% (p=0.000 n=10)
LastIndexAnyUTF8/1:32 83.87n ± 0% 62.90n ± 0% -25.00% (p=0.000 n=10)
LastIndexAnyUTF8/1:64 121.50n ± 0% 81.67n ± 0% -32.78% (p=0.000 n=10)
LastIndexAnyUTF8/16:1 330.0n ± 0% 330.0n ± 0% ~ (p=1.000 n=10)
LastIndexAnyUTF8/16:2 365.4n ± 1% 376.1n ± 0% +2.93% (p=0.000 n=10)
LastIndexAnyUTF8/16:4 399.9n ± 0% 410.6n ± 0% +2.68% (p=0.000 n=10)
LastIndexAnyUTF8/16:8 485.5n ± 0% 496.2n ± 0% +2.20% (p=0.000 n=10)
LastIndexAnyUTF8/16:16 656.8n ± 0% 667.5n ± 0% +1.63% (p=0.000 n=10)
LastIndexAnyUTF8/16:32 999.3n ± 0% 882.6n ± 0% -11.68% (p=0.000 n=10)
LastIndexAnyUTF8/16:64 1.744µ ± 0% 1.129µ ± 0% -35.26% (p=0.000 n=10)
LastIndexAnyUTF8/256:1 4.023µ ± 0% 4.023µ ± 0% 0.00% (p=0.033 n=10)
LastIndexAnyUTF8/256:2 4.645µ ± 0% 4.816µ ± 0% +3.68% (p=0.000 n=10)
LastIndexAnyUTF8/256:4 5.217µ ± 0% 5.388µ ± 0% +3.28% (p=0.000 n=10)
LastIndexAnyUTF8/256:8 6.587µ ± 0% 6.758µ ± 0% +2.60% (p=0.000 n=10)
LastIndexAnyUTF8/256:16 9.327µ ± 0% 9.498µ ± 0% +1.83% (p=0.000 n=10)
LastIndexAnyUTF8/256:32 14.81µ ± 0% 12.92µ ± 0% -12.73% (p=0.000 n=10)
LastIndexAnyUTF8/256:64 26.69µ ± 0% 16.84µ ± 0% -36.92% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic2 625.6µ ± 0% 625.6µ ± 0% ~ (p=0.529 n=10)
IndexPeriodic/IndexPeriodic4 625.5µ ± 0% 625.6µ ± 0% +0.01% (p=0.002 n=10)
IndexPeriodic/IndexPeriodic8 625.4µ ± 0% 625.4µ ± 0% +0.01% (p=0.001 n=10)
IndexPeriodic/IndexPeriodic16 236.5µ ± 0% 225.4µ ± 0% -4.69% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic32 171.1µ ± 3% 133.4µ ± 0% -22.05% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic64 139.10µ ± 3% 89.28µ ± 0% -35.82% (p=0.000 n=10)
geomean 4.222µ 3.628µ -14.07%
Subset of strings Index benchmarks
IndexRune 110.7n ± 0% 117.7n ± 0% +6.32% (p=0.000 n=10)
IndexRuneLongString 246.6n ± 0% 187.4n ± 3% -24.01% (p=0.000 n=10)
IndexRuneFastPath 46.82n ± 0% 46.06n ± 0% -1.62% (p=0.000 n=10)
Index 48.28n ± 0% 47.61n ± 0% -1.39% (p=0.000 n=10)
LastIndex 34.50n ± 0% 34.50n ± 0% ~ (p=1.000 n=10) ¹
IndexByte 41.72n ± 0% 40.83n ± 0% -2.13% (p=0.000 n=10)
IndexHard1 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10)
IndexHard2 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10)
IndexHard3 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10)
IndexHard4 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10)
LastIndexHard1 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10)
LastIndexHard2 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10)
LastIndexHard3 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10)
IndexTorture 71.33µ ± 0% 71.37µ ± 0% +0.05% (p=0.000 n=10)
IndexAnyASCII/1:1 34.40n ± 0% 35.07n ± 0% +1.95% (p=0.000 n=10)
IndexAnyASCII/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10)
IndexAnyASCII/1:4 49.53n ± 0% 50.20n ± 0% +1.35% (p=0.000 n=10)
IndexAnyASCII/1:8 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10)
IndexAnyASCII/1:16 65.56n ± 0% 66.24n ± 0% +1.04% (p=0.000 n=10)
IndexAnyASCII/1:32 86.97n ± 0% 77.82n ± 0% -10.52% (p=0.000 n=10)
IndexAnyASCII/1:64 134.50n ± 0% 98.57n ± 0% -26.71% (p=0.000 n=10)
IndexAnyASCII/16:1 54.19n ± 0% 54.86n ± 0% +1.24% (p=0.000 n=10)
IndexAnyASCII/16:2 257.4n ± 0% 256.7n ± 0% -0.27% (p=0.000 n=10)
IndexAnyASCII/16:4 275.3n ± 0% 275.3n ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/16:8 315.4n ± 0% 315.5n ± 0% +0.03% (p=0.001 n=10)
IndexAnyASCII/16:16 405.4n ± 0% 405.4n ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/16:32 566.0n ± 0% 566.0n ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/16:64 887.0n ± 0% 887.1n ± 0% ~ (p=0.181 n=10)
IndexAnyASCII/256:1 380.0n ± 0% 174.7n ± 0% -54.03% (p=0.000 n=10)
IndexAnyASCII/256:2 2.826µ ± 0% 2.826µ ± 0% ~ (p=1.000 n=10) ¹
IndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹
IndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=0.087 n=10)
IndexAnyASCII/256:16 2.974µ ± 0% 2.974µ ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/256:32 3.135µ ± 0% 3.135µ ± 0% ~ (p=1.000 n=10)
IndexAnyASCII/256:64 3.456µ ± 0% 3.456µ ± 0% ~ (p=1.000 n=10) ¹
IndexAnyUTF8/1:1 38.13n ± 0% 38.13n ± 0% ~ (p=1.000 n=10) ¹
IndexAnyUTF8/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10)
IndexAnyUTF8/1:4 49.53n ± 0% 50.19n ± 0% +1.33% (p=0.000 n=10)
IndexAnyUTF8/1:8 54.86n ± 0% 55.52n ± 0% +1.20% (p=0.000 n=10)
IndexAnyUTF8/1:16 65.56n ± 0% 66.23n ± 0% +1.02% (p=0.000 n=10)
IndexAnyUTF8/1:32 86.97n ± 0% 82.25n ± 0% -5.42% (p=0.000 n=10)
IndexAnyUTF8/1:64 134.50n ± 0% 99.96n ± 0% -25.68% (p=0.000 n=10)
IndexAnyUTF8/16:1 98.34n ± 0% 98.34n ± 0% ~ (p=1.000 n=10)
IndexAnyUTF8/16:2 462.7n ± 0% 473.7n ± 0% +2.38% (p=0.000 n=10)
IndexAnyUTF8/16:4 504.6n ± 0% 515.3n ± 0% +2.11% (p=0.000 n=10)
IndexAnyUTF8/16:8 589.1n ± 0% 599.7n ± 0% +1.80% (p=0.000 n=10)
IndexAnyUTF8/16:16 760.4n ± 0% 770.9n ± 0% +1.38% (p=0.000 n=10)
IndexAnyUTF8/16:32 1.103µ ± 0% 1.023µ ± 0% -7.25% (p=0.000 n=10)
IndexAnyUTF8/16:64 1.857µ ± 0% 1.294µ ± 0% -30.32% (p=0.000 n=10)
IndexAnyUTF8/256:1 1.066µ ± 0% 1.066µ ± 0% ~ (p=1.000 n=10) ¹
IndexAnyUTF8/256:2 6.106µ ± 0% 6.277µ ± 0% +2.81% (p=0.000 n=10)
IndexAnyUTF8/256:4 6.787µ ± 0% 6.958µ ± 0% +2.52% (p=0.000 n=10)
IndexAnyUTF8/256:8 8.136µ ± 0% 8.308µ ± 0% +2.11% (p=0.000 n=10)
IndexAnyUTF8/256:16 10.88µ ± 0% 11.05µ ± 0% +1.57% (p=0.000 n=10)
IndexAnyUTF8/256:32 16.36µ ± 0% 14.90µ ± 0% -8.93% (p=0.000 n=10)
IndexAnyUTF8/256:64 28.51µ ± 0% 19.41µ ± 0% -31.92% (p=0.000 n=10)
LastIndexAnyASCII/1:1 35.79n ± 0% 38.52n ± 0% +7.63% (p=0.000 n=10)
LastIndexAnyASCII/1:2 37.12n ± 0% 39.85n ± 0% +7.35% (p=0.000 n=10)
LastIndexAnyASCII/1:4 39.76n ± 0% 42.08n ± 0% +5.84% (p=0.000 n=10)
LastIndexAnyASCII/1:8 44.82n ± 0% 47.22n ± 0% +5.34% (p=0.000 n=10)
LastIndexAnyASCII/1:16 55.53n ± 0% 57.92n ± 3% +4.30% (p=0.000 n=10)
LastIndexAnyASCII/1:32 76.94n ± 0% 70.16n ± 0% -8.81% (p=0.000 n=10)
LastIndexAnyASCII/1:64 124.40n ± 0% 89.67n ± 0% -27.92% (p=0.000 n=10)
LastIndexAnyASCII/16:1 245.9n ± 0% 245.9n ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/16:4 275.1n ± 0% 275.1n ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/16:8 315.2n ± 0% 315.2n ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/16:16 400.4n ± 0% 400.4n ± 0% ~ (p=0.087 n=10)
LastIndexAnyASCII/16:32 560.9n ± 0% 560.9n ± 0% ~ (p=0.124 n=10)
LastIndexAnyASCII/16:64 882.1n ± 0% 882.0n ± 0% -0.01% (p=0.003 n=10)
LastIndexAnyASCII/256:1 2.815µ ± 0% 2.815µ ± 0% ~ (p=0.211 n=10)
LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:16 2.969µ ± 0% 2.969µ ± 0% ~ (p=1.000 n=10)
LastIndexAnyASCII/256:32 3.130µ ± 0% 3.130µ ± 0% ~ (p=1.000 n=10) ¹
LastIndexAnyASCII/256:64 3.451µ ± 0% 3.451µ ± 0% ~ (p=0.474 n=10)
LastIndexAnyUTF8/1:1 35.79n ± 0% 36.13n ± 0% +0.95% (p=0.000 n=10)
LastIndexAnyUTF8/1:2 37.11n ± 0% 37.47n ± 0% +0.97% (p=0.000 n=10)
LastIndexAnyUTF8/1:4 39.75n ± 0% 40.14n ± 0% +0.97% (p=0.000 n=10)
LastIndexAnyUTF8/1:8 44.82n ± 0% 45.49n ± 0% +1.49% (p=0.000 n=10)
LastIndexAnyUTF8/1:16 55.52n ± 0% 56.20n ± 0% +1.22% (p=0.000 n=10)
LastIndexAnyUTF8/1:32 76.93n ± 0% 74.25n ± 0% -3.48% (p=0.000 n=10)
LastIndexAnyUTF8/1:64 124.40n ± 0% 91.15n ± 0% -26.73% (p=0.000 n=10)
LastIndexAnyUTF8/16:1 322.5n ± 0% 322.5n ± 0% ~ (p=0.087 n=10)
LastIndexAnyUTF8/16:2 634.2n ± 0% 616.4n ± 0% -2.81% (p=0.000 n=10)
LastIndexAnyUTF8/16:4 674.5n ± 0% 657.9n ± 0% -2.46% (p=0.000 n=10)
LastIndexAnyUTF8/16:8 758.3n ± 0% 741.0n ± 0% -2.28% (p=0.000 n=10)
LastIndexAnyUTF8/16:16 929.6n ± 0% 912.3n ± 0% -1.86% (p=0.000 n=10)
LastIndexAnyUTF8/16:32 1.272µ ± 0% 1.176µ ± 0% -7.55% (p=0.000 n=10)
LastIndexAnyUTF8/16:64 2.018µ ± 0% 1.453µ ± 0% -28.00% (p=0.000 n=10)
LastIndexAnyUTF8/256:1 4.015µ ± 0% 4.016µ ± 0% +0.02% (p=0.000 n=10)
LastIndexAnyUTF8/256:2 8.896µ ± 0% 8.537µ ± 0% -4.04% (p=0.000 n=10)
LastIndexAnyUTF8/256:4 9.553µ ± 0% 9.217µ ± 0% -3.52% (p=0.000 n=10)
LastIndexAnyUTF8/256:8 10.90µ ± 0% 10.54µ ± 0% -3.29% (p=0.000 n=10)
LastIndexAnyUTF8/256:16 13.64µ ± 0% 13.28µ ± 0% -2.63% (p=0.000 n=10)
LastIndexAnyUTF8/256:32 19.12µ ± 0% 17.16µ ± 1% -10.23% (p=0.000 n=10)
LastIndexAnyUTF8/256:64 31.11µ ± 0% 21.98µ ± 0% -29.36% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic2 625.5µ ± 0% 625.5µ ± 0% ~ (p=0.955 n=10)
IndexPeriodic/IndexPeriodic4 625.4µ ± 0% 625.4µ ± 0% ~ (p=0.838 n=10)
IndexPeriodic/IndexPeriodic8 625.3µ ± 0% 625.3µ ± 0% +0.01% (p=0.009 n=10)
IndexPeriodic/IndexPeriodic16 229.8µ ± 0% 227.0µ ± 0% -1.22% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic32 168.9µ ± 3% 131.8µ ± 0% -22.00% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic64 126.36µ ± 0% 86.66µ ± 0% -31.42% (p=0.000 n=10)
geomean 1.361µ 1.302µ -4.31%
As these functions are so heavily used this change impacts other
benchmarks. I include the improvements in geomean for all the
benchmarks in the strings and bytes packages, along with some
selected benchmarks to illustrate the impact of the change.
geomean for bytes 13.81µ 12.92µ -6.44%
geomean for string 9.385µ 9.224µ -1.72%
Note that when building for rva22u64 a single Zbb instruction is used
in the main loop. This also helps to improve performance slightly.
The geomean for all the bytes benchmarks when building with
GORISCV64=rva22u64 with and without the patch is shown below.
geomean for bytes (rva22u64) 13.46µ 12.49µ -7.21%
Examples of non-Index benchmarks affected by this commit.
ReadString uses IndexByte to search for a byte stored at the end of
a 32KB buffer, so we see a speed up. SplitSingleByteSeparator searches
large buffers, but the byte being sought occurs within the first 15
bytes of the buffer, 76% of the time, hence the slowdown. In
SplitMultiByteSeparator the first byte of the separator only occurs
in the first 15 bytes 33% of the time so we see a speed up.
ReadString 105.13µ ± 2% 74.67µ ± 0% -28.97% (p=0.000 n=10)
SplitSingleByteSeparator 11.31m ± 2% 12.43m ± 1% +9.83% (p=0.000 n=10)
SplitMultiByteSeparator 8.070m ± 1% 7.707m ± 1% -4.49% (p=0.000 n=10)
Change-Id: I6210ea2f3decdc6d2e0609df72b1b66e6d6f5395
Reviewed-on: https://go-review.googlesource.com/c/go/+/561275
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
BGT, BLT, BLE, BGE, BNE, BVS, BVC, and BEQ support by assembler. This will simplify the usage of BC constructs like
BC 12, 30, LR <=> BEQ CR7, LR
BC 12, 2, LR <=> BEQ CR0, LR
BC 12, 0, target <=> BLT CR0, target
BC 12, 2, target <=> BEQ CR0, target
BC 12, 5, target <=> BGT CR1, target
BC 12, 30, target <=> BEQ CR7, target
BC 4, 6, target <=> BNE CR1, target
BC 4, 5, target <=> BLE CR1, target
code cleanup based on the above additions.
Change-Id: I02fdb212b6fe3f85ce447e05f4d42118c9ce63b5
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/612395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
|
|
The relevant performance improved by 66.73%.
benchmark:
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
BytesCompare/1 5.603n ± 0% 4.002n ± 0% -28.57% (p=0.000 n=20)
BytesCompare/2 6.405n ± 0% 4.002n ± 0% -37.52% (p=0.000 n=20)
BytesCompare/4 8.007n ± 0% 4.002n ± 0% -50.02% (p=0.000 n=20)
BytesCompare/8 11.210n ± 0% 4.002n ± 0% -64.30% (p=0.000 n=20)
BytesCompare/16 6.005n ± 0% 4.802n ± 0% -20.03% (p=0.000 n=20)
BytesCompare/32 6.806n ± 0% 4.402n ± 0% -35.32% (p=0.000 n=20)
BytesCompare/64 8.407n ± 0% 6.003n ± 0% -28.60% (p=0.000 n=20)
BytesCompare/128 11.610n ± 0% 8.404n ± 0% -27.61% (p=0.000 n=20)
BytesCompare/256 18.02n ± 0% 14.01n ± 0% -22.25% (p=0.000 n=20)
BytesCompare/512 31.23n ± 0% 26.98n ± 0% -13.61% (p=0.000 n=20)
BytesCompare/1024 56.85n ± 0% 52.43n ± 0% -7.77% (p=0.000 n=20)
BytesCompare/2048 108.1n ± 0% 103.8n ± 0% -3.98% (p=0.000 n=20)
CompareBytesEqual 15.610n ± 0% 5.203n ± 0% -66.67% (p=0.000 n=20)
CompareBytesToNil 3.203n ± 0% 3.202n ± 0% -0.03% (p=0.000 n=20)
CompareBytesEmpty 3.203n ± 0% 2.423n ± 0% -24.35% (p=0.000 n=20)
CompareBytesIdentical 3.203n ± 0% 2.424n ± 0% -24.32% (p=0.000 n=20)
CompareBytesSameLength 8.407n ± 0% 8.004n ± 0% -4.79% (p=0.000 n=20)
CompareBytesDifferentLength 8.808n ± 0% 7.604n ± 0% -13.67% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=1 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=2 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=3 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=4 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=5 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=6 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=7 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=0 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=1 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=2 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=3 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=4 839.83µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=5 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=6 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=7 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20)
CompareBytesBig 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.001 n=20)
CompareBytesBigIdentical 2.802n ± 0% 2.801n ± 0% -0.04% (p=0.001 n=20)
geomean 1.524µ 507.2n -66.73%
Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159
Reviewed-on: https://go-review.googlesource.com/c/go/+/589538
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Tim King <taking@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
|
|
Remove redundant intermediate jump in runtime.memequal
Remove redundant a.ptr==b.ptr check in runtime.memequal_varlen
Add 16-byte alignment before some labels in runtime.memequal
goos: linux
goarch: arm64
pkg: bytes
│ ./master.log │ ./opt.log │
│ sec/op │ sec/op vs base │
Equal/0-4 0.8342n ± 0% 0.5254n ± 3% -37.01% (p=0.000 n=8)
Equal/same/1-4 2.720n ± 0% 2.720n ± 2% ~ (p=0.779 n=8)
Equal/same/6-4 2.720n ± 5% 2.720n ± 2% ~ (p=0.908 n=8)
Equal/same/9-4 2.722n ± 2% 2.721n ± 2% ~ (p=0.779 n=8)
Equal/same/15-4 2.719n ± 0% 2.719n ± 0% ~ (p=0.641 n=8)
Equal/same/16-4 2.721n ± 2% 2.719n ± 0% -0.07% (p=0.014 n=8)
Equal/same/20-4 2.720n ± 0% 2.721n ± 2% ~ (p=0.236 n=8)
Equal/same/32-4 2.720n ± 1% 2.720n ± 0% ~ (p=0.396 n=8)
Equal/same/4K-4 2.719n ± 0% 2.720n ± 0% ~ (p=0.663 n=8)
Equal/same/4M-4 2.721n ± 0% 2.720n ± 0% ~ (p=0.075 n=8)
Equal/same/64M-4 2.720n ± 0% 2.720n ± 2% ~ (p=0.806 n=8)
Equal/1-4 6.671n ± 0% 5.449n ± 0% -18.33% (p=0.000 n=8)
Equal/6-4 8.761n ± 2% 7.508n ± 0% -14.30% (p=0.000 n=8)
Equal/9-4 8.343n ± 0% 7.091n ± 0% -15.01% (p=0.000 n=8)
Equal/15-4 8.339n ± 2% 7.090n ± 0% -14.98% (p=0.000 n=8)
Equal/16-4 9.173n ± 0% 7.925n ± 2% -13.61% (p=0.000 n=8)
Equal/20-4 11.26n ± 0% 10.01n ± 0% -11.10% (p=0.000 n=8)
Equal/32-4 10.425n ± 0% 9.176n ± 0% -11.98% (p=0.000 n=8)
Equal/4K-4 192.9n ± 0% 192.7n ± 0% -0.10% (p=0.044 n=8)
Equal/4M-4 191.3µ ± 0% 191.3µ ± 0% ~ (p=0.798 n=8)
Equal/64M-4 3.066m ± 2% 3.065m ± 0% ~ (p=0.083 n=8)
EqualBothUnaligned/64_0-4 7.506n ± 2% 7.090n ± 2% -5.55% (p=0.000 n=8)
EqualBothUnaligned/64_1-4 7.850n ± 1% 7.423n ± 0% -5.43% (p=0.000 n=8)
EqualBothUnaligned/64_4-4 7.505n ± 0% 7.088n ± 0% -5.56% (p=0.000 n=8)
EqualBothUnaligned/64_7-4 7.840n ± 0% 7.413n ± 0% -5.44% (p=0.000 n=8)
EqualBothUnaligned/4096_0-4 193.0n ± 4% 190.9n ± 0% -1.09% (p=0.004 n=8)
EqualBothUnaligned/4096_1-4 223.9n ± 0% 223.1n ± 0% -0.36% (p=0.000 n=8)
EqualBothUnaligned/4096_4-4 191.9n ± 2% 191.5n ± 0% -0.21% (p=0.004 n=8)
EqualBothUnaligned/4096_7-4 223.8n ± 0% 223.1n ± 1% ~ (p=0.098 n=8)
EqualBothUnaligned/4194304_0-4 191.8µ ± 0% 191.8µ ± 0% ~ (p=0.504 n=8)
EqualBothUnaligned/4194304_1-4 225.4µ ± 2% 225.5µ ± 0% ~ (p=0.065 n=8)
EqualBothUnaligned/4194304_4-4 192.6µ ± 0% 192.7µ ± 2% +0.06% (p=0.041 n=8)
EqualBothUnaligned/4194304_7-4 225.4µ ± 0% 225.5µ ± 0% +0.05% (p=0.050 n=8)
EqualBothUnaligned/67108864_0-4 3.069m ± 0% 3.069m ± 0% ~ (p=0.314 n=8)
EqualBothUnaligned/67108864_1-4 3.589m ± 0% 3.588m ± 0% ~ (p=0.959 n=8)
EqualBothUnaligned/67108864_4-4 3.083m ± 0% 3.083m ± 2% ~ (p=0.505 n=8)
EqualBothUnaligned/67108864_7-4 3.588m ± 0% 3.588m ± 0% ~ (p=1.000 n=8)
geomean 199.9n 190.5n -4.70%
Change-Id: Ib8d0d4006dd39162a600ac98a5f44a0f05136ed3
Reviewed-on: https://go-review.googlesource.com/c/go/+/601135
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
|
|
The mem address should be regarded as uint32.
Fixes #65571
Change-Id: Icee38d11f2d93eeca7d50b2e133159e321daeb90
GitHub-Last-Rev: c2568b104369bcf5c4d42c6281d235a52bb9675f
GitHub-Pull-Request: golang/go#68400
Reviewed-on: https://go-review.googlesource.com/c/go/+/597955
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cuong Manh Le <cuong.manhle.vn@gmail.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
Add linknames for most modules with ≥50 dependents.
Add linknames for a few other modules that we know
are important but are below 50.
Remove linknames from badlinkname.go that do not merit
inclusion (very small number of dependents).
We can add them back later if the need arises.
Fixes #67401. (For now.)
Change-Id: I1e49fec0292265256044d64b1841d366c4106002
Reviewed-on: https://go-review.googlesource.com/c/go/+/587756
Auto-Submit: Russ Cox <rsc@golang.org>
TryBot-Bypass: Russ Cox <rsc@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
|
|
For #67401.
Change-Id: I015408a3f437c1733d97160ef2fb5da6d4efcc5c
Reviewed-on: https://go-review.googlesource.com/c/go/+/587598
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Russ Cox <rsc@golang.org>
|
|
For #67401.
Change-Id: I7dd28c3b01a1a647f84929d15412aa43ab0089ee
Reviewed-on: https://go-review.googlesource.com/c/go/+/587575
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
Change-Id: If1d3eba9a922ac6f9d78301bb8f07e445c712899
Reviewed-on: https://go-review.googlesource.com/c/go/+/525576
Run-TryBot: Cherry Mui <cherryyz@google.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Commit-Queue: abner chenc <chenguoqi@loongson.cn>
Run-TryBot: abner chenc <chenguoqi@loongson.cn>
|
|
slices.SortFunc requires a three-way comparison and we need an
efficient strings.Compare to perform three-way string comparisons.
This new implementation adds bytealg.CompareString as a wrapper of
runtime_cmpstring and changes Compare to use bytealg.CompareString.
The new implementation of Compare with runtime_cmpstring is about
28% faster than the previous one.
Fixes #61725
│ /tmp/gobench-sort-cmp.txt │ /tmp/gobench-sort-strings.txt │
│ sec/op │ sec/op vs base │
SortFuncStruct/Size16-48 918.8n ± 1% 726.6n ± 0% -20.92% (p=0.000 n=10)
SortFuncStruct/Size32-48 2.666µ ± 1% 2.003µ ± 1% -24.85% (p=0.000 n=10)
SortFuncStruct/Size64-48 1.934µ ± 1% 1.331µ ± 1% -31.22% (p=0.000 n=10)
SortFuncStruct/Size128-48 3.560µ ± 1% 2.423µ ± 0% -31.94% (p=0.000 n=10)
SortFuncStruct/Size512-48 13.019µ ± 0% 9.071µ ± 0% -30.33% (p=0.000 n=10)
SortFuncStruct/Size1024-48 25.61µ ± 0% 17.75µ ± 0% -30.70% (p=0.000 n=10)
geomean 4.217µ 3.018µ -28.44%
Change-Id: I2513b6f8c1b9b273ef2d23f0a86f691e2d097eb6
Reviewed-on: https://go-review.googlesource.com/c/go/+/532195
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@golang.org>
Reviewed-by: qiu laidongfeng2 <2645477756@qq.com>
Reviewed-by: Keith Randall <khr@google.com>
|
|
Fixes #64833
Change-Id: Ice3f5dfab65f5525bc7a6f57ddeaabda8d64dfa3
GitHub-Last-Rev: 38f1d6c19d8ec29ae5645ce677839a301f798df3
GitHub-Pull-Request: golang/go#64835
Reviewed-on: https://go-review.googlesource.com/c/go/+/552135
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
If memequal is invoked with the same pointers as arguments it ends up
comparing the whole memory contents, instead of just comparing the pointers.
This effectively makes an operation that could be O(1) into O(n). All the
other architectures already have this optimization in place. For
instance, arm64 also has it, in memequal_varlen.
Such an optimization is very specific; one case that will probably benefit is
programs that rely heavily on interning of strings.
goos: darwin
goarch: arm64
pkg: bytes
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
Equal/same/1-8 2.678n ± ∞ ¹ 2.400n ± ∞ ¹ -10.38% (p=0.008 n=5)
Equal/same/6-8 3.267n ± ∞ ¹ 2.431n ± ∞ ¹ -25.59% (p=0.008 n=5)
Equal/same/9-8 2.981n ± ∞ ¹ 2.385n ± ∞ ¹ -19.99% (p=0.008 n=5)
Equal/same/15-8 2.974n ± ∞ ¹ 2.390n ± ∞ ¹ -19.64% (p=0.008 n=5)
Equal/same/16-8 2.983n ± ∞ ¹ 2.380n ± ∞ ¹ -20.21% (p=0.008 n=5)
Equal/same/20-8 3.567n ± ∞ ¹ 2.384n ± ∞ ¹ -33.17% (p=0.008 n=5)
Equal/same/32-8 3.568n ± ∞ ¹ 2.385n ± ∞ ¹ -33.16% (p=0.008 n=5)
Equal/same/4K-8 78.040n ± ∞ ¹ 2.378n ± ∞ ¹ -96.95% (p=0.008 n=5)
Equal/same/4M-8 78713.000n ± ∞ ¹ 2.385n ± ∞ ¹ -100.00% (p=0.008 n=5)
Equal/same/64M-8 1348095.000n ± ∞ ¹ 2.381n ± ∞ ¹ -100.00% (p=0.008 n=5)
geomean 43.52n 2.390n -94.51%
¹ need >= 6 samples for confidence interval at level 0.95
│ old.txt │ new.txt │
│ B/s │ B/s vs base │
Equal/same/1-8 356.1Mi ± ∞ ¹ 397.3Mi ± ∞ ¹ +11.57% (p=0.008 n=5)
Equal/same/6-8 1.711Gi ± ∞ ¹ 2.298Gi ± ∞ ¹ +34.35% (p=0.008 n=5)
Equal/same/9-8 2.812Gi ± ∞ ¹ 3.515Gi ± ∞ ¹ +24.99% (p=0.008 n=5)
Equal/same/15-8 4.698Gi ± ∞ ¹ 5.844Gi ± ∞ ¹ +24.41% (p=0.008 n=5)
Equal/same/16-8 4.995Gi ± ∞ ¹ 6.260Gi ± ∞ ¹ +25.34% (p=0.008 n=5)
Equal/same/20-8 5.222Gi ± ∞ ¹ 7.814Gi ± ∞ ¹ +49.63% (p=0.008 n=5)
Equal/same/32-8 8.353Gi ± ∞ ¹ 12.496Gi ± ∞ ¹ +49.59% (p=0.008 n=5)
Equal/same/4K-8 48.88Gi ± ∞ ¹ 1603.96Gi ± ∞ ¹ +3181.17% (p=0.008 n=5)
Equal/same/4M-8 49.63Gi ± ∞ ¹ 1637911.85Gi ± ∞ ¹ +3300381.91% (p=0.008 n=5)
Equal/same/64M-8 46.36Gi ± ∞ ¹ 26253069.97Gi ± ∞ ¹ +56626517.99% (p=0.008 n=5)
geomean 6.737Gi 122.7Gi +1721.01%
¹ need >= 6 samples for confidence interval at level 0.95
Fixes #64381
Change-Id: I7d423930a688edd88c4ba60d45e097296d9be852
GitHub-Last-Rev: ae8189fafb1cba87b5394f09f971746ae9299273
GitHub-Pull-Request: golang/go#64419
Reviewed-on: https://go-review.googlesource.com/c/go/+/545416
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Run-TryBot: Cherry Mui <cherryyz@google.com>
|
|
For #63678
Benchmark on Milk-V Mars CM eMMC (Starfive/JH7110 SoC)
goos: linux
goarch: riscv64
pkg: bytes
│ /root/bytes.old.bench │ /root/bytes.pc16.bench │
│ sec/op │ sec/op vs base │
Count/10 223.9n ± 1% 220.8n ± 1% -1.36% (p=0.001 n=10)
Count/32 571.6n ± 0% 571.3n ± 0% ~ (p=0.054 n=10)
Count/4K 38.56µ ± 0% 38.55µ ± 0% -0.01% (p=0.010 n=10)
Count/4M 40.13m ± 0% 39.21m ± 0% -2.28% (p=0.000 n=10)
Count/64M 627.5m ± 0% 627.4m ± 0% -0.01% (p=0.019 n=10)
CountEasy/10 101.3n ± 0% 101.3n ± 0% ~ (p=1.000 n=10) ¹
CountEasy/32 139.3n ± 0% 139.3n ± 0% ~ (p=1.000 n=10) ¹
CountEasy/4K 5.565µ ± 0% 5.564µ ± 0% -0.02% (p=0.001 n=10)
CountEasy/4M 5.619m ± 0% 5.619m ± 0% ~ (p=0.190 n=10)
CountEasy/64M 89.94m ± 0% 89.93m ± 0% ~ (p=0.436 n=10)
CountSingle/10 53.80n ± 0% 46.06n ± 0% -14.39% (p=0.000 n=10)
CountSingle/32 104.30n ± 0% 79.64n ± 0% -23.64% (p=0.000 n=10)
CountSingle/4K 10.413µ ± 0% 7.247µ ± 0% -30.40% (p=0.000 n=10)
CountSingle/4M 11.603m ± 0% 8.388m ± 0% -27.71% (p=0.000 n=10)
CountSingle/64M 230.9m ± 0% 172.3m ± 0% -25.40% (p=0.000 n=10)
CountHard1 9.981m ± 0% 9.981m ± 0% ~ (p=0.810 n=10)
CountHard2 9.981m ± 0% 9.981m ± 0% ~ (p=0.315 n=10)
CountHard3 9.981m ± 0% 9.981m ± 0% ~ (p=0.159 n=10)
geomean 144.6µ 133.5µ -7.70%
¹ all samples are equal
│ /root/bytes.old.bench │ /root/bytes.pc16.bench │
│ B/s │ B/s vs base │
Count/10 42.60Mi ± 1% 43.19Mi ± 1% +1.39% (p=0.001 n=10)
Count/32 53.38Mi ± 0% 53.42Mi ± 0% +0.06% (p=0.049 n=10)
Count/4K 101.3Mi ± 0% 101.3Mi ± 0% ~ (p=0.077 n=10)
Count/4M 99.68Mi ± 0% 102.01Mi ± 0% +2.34% (p=0.000 n=10)
Count/64M 102.0Mi ± 0% 102.0Mi ± 0% ~ (p=0.076 n=10)
CountEasy/10 94.18Mi ± 0% 94.18Mi ± 0% ~ (p=0.054 n=10)
CountEasy/32 219.1Mi ± 0% 219.1Mi ± 0% +0.01% (p=0.016 n=10)
CountEasy/4K 702.0Mi ± 0% 702.0Mi ± 0% +0.00% (p=0.000 n=10)
CountEasy/4M 711.9Mi ± 0% 711.9Mi ± 0% ~ (p=0.133 n=10)
CountEasy/64M 711.6Mi ± 0% 711.7Mi ± 0% ~ (p=0.447 n=10)
CountSingle/10 177.2Mi ± 0% 207.0Mi ± 0% +16.81% (p=0.000 n=10)
CountSingle/32 292.7Mi ± 0% 383.2Mi ± 0% +30.91% (p=0.000 n=10)
CountSingle/4K 375.1Mi ± 0% 539.0Mi ± 0% +43.70% (p=0.000 n=10)
CountSingle/4M 344.7Mi ± 0% 476.9Mi ± 0% +38.33% (p=0.000 n=10)
CountSingle/64M 277.2Mi ± 0% 371.5Mi ± 0% +34.05% (p=0.000 n=10)
geomean 199.7Mi 219.8Mi +10.10%
Change-Id: I1abf6b220b9802028f8ad5eebc8d3b7cfa3e89ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/541756
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: M Zhuo <mzh@golangcn.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Wang Yaduo <wangyaduo@linux.alibaba.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
|
|
Update #40724
Co-authored-by: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Change-Id: I4a7392afd7238d44e7d09aaca7e0d733649926ac
Reviewed-on: https://go-review.googlesource.com/c/go/+/521785
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: David Chase <drchase@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
Auto-Submit: David Chase <drchase@google.com>
|
|
goos: linux
goarch: amd64
pkg: bytes
cpu: Intel(R) Core(TM) i5-8350U CPU @ 1.70GHz
│ master │ HEAD │
│ sec/op │ sec/op vs base │
Equal/0-8 0.2800n ± 22% 0.2865n ± 26% ~ (p=0.075 n=10)
Equal/1-8 18.57n ± 2% 19.34n ± 6% +4.15% (p=0.014 n=10)
Equal/6-8 19.07n ± 1% 19.38n ± 2% +1.63% (p=0.014 n=10)
Equal/9-8 19.39n ± 2% 19.05n ± 1% -1.78% (p=0.005 n=10)
Equal/15-8 19.46n ± 1% 19.10n ± 1% -1.85% (p=0.000 n=10)
Equal/16-8 19.36n ± 2% 18.95n ± 1% -2.09% (p=0.011 n=10)
Equal/20-8 20.20n ± 1% 19.83n ± 1% -1.86% (p=0.001 n=10)
Equal/32-8 20.95n ± 1% 20.84n ± 1% -0.57% (p=0.010 n=10)
Equal/4K-8 97.40n ± 2% 81.34n ± 3% -16.49% (p=0.000 n=10)
Equal/4M-8 81.74µ ± 3% 71.52µ ± 4% -12.49% (p=0.000 n=10)
Equal/64M-8 1.319m ± 1% 1.139m ± 3% -13.68% (p=0.000 n=10)
EqualBothUnaligned/64_0-8 8.707n ± 4% 8.588n ± 3% ~ (p=0.353 n=10)
EqualBothUnaligned/64_1-8 8.513n ± 3% 8.614n ± 2% ~ (p=0.481 n=10)
EqualBothUnaligned/64_4-8 8.752n ± 3% 8.637n ± 4% ~ (p=0.148 n=10)
EqualBothUnaligned/64_7-8 8.742n ± 3% 8.514n ± 2% ~ (p=0.052 n=10)
EqualBothUnaligned/4096_0-8 89.87n ± 3% 70.44n ± 5% -21.63% (p=0.000 n=10)
EqualBothUnaligned/4096_1-8 91.67n ± 5% 70.89n ± 3% -22.67% (p=0.000 n=10)
EqualBothUnaligned/4096_4-8 90.43n ± 2% 70.52n ± 3% -22.01% (p=0.000 n=10)
EqualBothUnaligned/4096_7-8 89.53n ± 3% 72.02n ± 5% -19.56% (p=0.000 n=10)
EqualBothUnaligned/4194304_0-8 86.43µ ± 3% 73.40µ ± 4% -15.07% (p=0.000 n=10)
EqualBothUnaligned/4194304_1-8 85.48µ ± 2% 75.35µ ± 1% -11.85% (p=0.000 n=10)
EqualBothUnaligned/4194304_4-8 86.51µ ± 3% 75.44µ ± 4% -12.80% (p=0.000 n=10)
EqualBothUnaligned/4194304_7-8 86.40µ ± 3% 74.41µ ± 3% -13.88% (p=0.000 n=10)
EqualBothUnaligned/67108864_0-8 1.374m ± 3% 1.171m ± 3% -14.75% (p=0.000 n=10)
EqualBothUnaligned/67108864_1-8 1.401m ± 4% 1.198m ± 4% -14.49% (p=0.000 n=10)
EqualBothUnaligned/67108864_4-8 1.393m ± 4% 1.205m ± 4% -13.53% (p=0.000 n=10)
EqualBothUnaligned/67108864_7-8 1.396m ± 3% 1.199m ± 4% -14.11% (p=0.000 n=10)
geomean 735.7n 666.7n -9.39%
│ master │ HEAD │
│ B/s │ B/s vs base │
Equal/1-8 51.36Mi ± 2% 49.32Mi ± 6% -3.98% (p=0.015 n=10)
Equal/6-8 300.0Mi ± 1% 295.3Mi ± 2% -1.57% (p=0.011 n=10)
Equal/9-8 442.5Mi ± 2% 450.6Mi ± 1% +1.82% (p=0.005 n=10)
Equal/15-8 734.9Mi ± 1% 748.8Mi ± 1% +1.90% (p=0.000 n=10)
Equal/16-8 788.4Mi ± 2% 805.2Mi ± 1% +2.14% (p=0.011 n=10)
Equal/20-8 944.2Mi ± 1% 961.8Mi ± 1% +1.87% (p=0.002 n=10)
Equal/32-8 1.422Gi ± 0% 1.430Gi ± 1% +0.58% (p=0.011 n=10)
Equal/4K-8 39.17Gi ± 2% 46.90Gi ± 3% +19.74% (p=0.000 n=10)
Equal/4M-8 47.79Gi ± 3% 54.62Gi ± 4% +14.27% (p=0.000 n=10)
Equal/64M-8 47.38Gi ± 1% 54.89Gi ± 3% +15.85% (p=0.000 n=10)
EqualBothUnaligned/64_0-8 6.845Gi ± 4% 6.940Gi ± 3% ~ (p=0.353 n=10)
EqualBothUnaligned/64_1-8 7.002Gi ± 3% 6.919Gi ± 2% ~ (p=0.481 n=10)
EqualBothUnaligned/64_4-8 6.811Gi ± 3% 6.901Gi ± 4% ~ (p=0.165 n=10)
EqualBothUnaligned/64_7-8 6.819Gi ± 3% 7.002Gi ± 2% ~ (p=0.052 n=10)
EqualBothUnaligned/4096_0-8 42.45Gi ± 3% 54.16Gi ± 5% +27.60% (p=0.000 n=10)
EqualBothUnaligned/4096_1-8 41.61Gi ± 6% 53.82Gi ± 3% +29.33% (p=0.000 n=10)
EqualBothUnaligned/4096_4-8 42.19Gi ± 2% 54.09Gi ± 3% +28.22% (p=0.000 n=10)
EqualBothUnaligned/4096_7-8 42.61Gi ± 3% 52.97Gi ± 5% +24.33% (p=0.000 n=10)
EqualBothUnaligned/4194304_0-8 45.20Gi ± 3% 53.22Gi ± 4% +17.75% (p=0.000 n=10)
EqualBothUnaligned/4194304_1-8 45.70Gi ± 2% 51.84Gi ± 1% +13.43% (p=0.000 n=10)
EqualBothUnaligned/4194304_4-8 45.15Gi ± 3% 51.78Gi ± 4% +14.68% (p=0.000 n=10)
EqualBothUnaligned/4194304_7-8 45.21Gi ± 3% 52.50Gi ± 4% +16.12% (p=0.000 n=10)
EqualBothUnaligned/67108864_0-8 45.50Gi ± 3% 53.37Gi ± 3% +17.30% (p=0.000 n=10)
EqualBothUnaligned/67108864_1-8 44.63Gi ± 4% 52.17Gi ± 4% +16.89% (p=0.000 n=10)
EqualBothUnaligned/67108864_4-8 44.86Gi ± 4% 51.88Gi ± 4% +15.65% (p=0.000 n=10)
EqualBothUnaligned/67108864_7-8 44.76Gi ± 3% 52.12Gi ± 4% +16.43% (p=0.000 n=10)
geomean 9.734Gi 10.79Gi +10.88%
For #63678
Change-Id: I427b8756e361fd4d36984c2bdb8bc3661ac3a0b8
GitHub-Last-Rev: 981d272d172a9e07c17fab04d6dbab032ecb2426
GitHub-Pull-Request: golang/go#63757
Reviewed-on: https://go-review.googlesource.com/c/go/+/537995
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: qiulaidongfeng <2645477756@qq.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
|
|
Use ADD with constants, instead of ADDI. Also use SUB with a positive constant
rather than ADD with a negative constant. The resulting assembly is still the
same.
Change-Id: Ife10bf5ae4122e525f0e7d41b5e463e748236a9c
Reviewed-on: https://go-review.googlesource.com/c/go/+/540136
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Run-TryBot: Joel Sing <joel@sing.id.au>
|
|
goos: windows
goarch: amd64
pkg: bytes
cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
IndexByte/10-16 2.613n ± 1% 2.558n ± 1% -2.09% (p=0.014 n=10)
IndexByte/32-16 3.034n ± 1% 3.010n ± 2% ~ (p=0.305 n=10)
IndexByte/4K-16 57.20n ± 2% 39.58n ± 2% -30.81% (p=0.000 n=10)
IndexByte/4M-16 34.48µ ± 1% 33.83µ ± 2% -1.87% (p=0.023 n=10)
IndexByte/64M-16 1.493m ± 2% 1.450m ± 2% -2.89% (p=0.000 n=10)
IndexBytePortable/10-16 3.172n ± 4% 3.163n ± 2% ~ (p=0.684 n=10)
IndexBytePortable/32-16 8.465n ± 2% 8.375n ± 3% ~ (p=0.631 n=10)
IndexBytePortable/4K-16 852.0n ± 1% 846.6n ± 3% ~ (p=0.971 n=10)
IndexBytePortable/4M-16 868.2µ ± 2% 856.6µ ± 2% ~ (p=0.393 n=10)
IndexBytePortable/64M-16 13.81m ± 2% 13.88m ± 3% ~ (p=0.684 n=10)
geomean 1.204µ 1.148µ -4.63%
│ old.txt │ new.txt │
│ B/s │ B/s vs base │
IndexByte/10-16 3.565Gi ± 1% 3.641Gi ± 1% +2.15% (p=0.015 n=10)
IndexByte/32-16 9.821Gi ± 1% 9.899Gi ± 2% ~ (p=0.315 n=10)
IndexByte/4K-16 66.70Gi ± 2% 96.39Gi ± 2% +44.52% (p=0.000 n=10)
IndexByte/4M-16 113.3Gi ± 1% 115.5Gi ± 2% +1.91% (p=0.023 n=10)
IndexByte/64M-16 41.85Gi ± 2% 43.10Gi ± 2% +2.98% (p=0.000 n=10)
IndexBytePortable/10-16 2.936Gi ± 4% 2.945Gi ± 2% ~ (p=0.684 n=10)
IndexBytePortable/32-16 3.521Gi ± 2% 3.559Gi ± 3% ~ (p=0.631 n=10)
IndexBytePortable/4K-16 4.477Gi ± 1% 4.506Gi ± 3% ~ (p=0.971 n=10)
IndexBytePortable/4M-16 4.499Gi ± 2% 4.560Gi ± 2% ~ (p=0.393 n=10)
IndexBytePortable/64M-16 4.525Gi ± 2% 4.504Gi ± 3% ~ (p=0.684 n=10)
geomean 10.04Gi 10.53Gi +4.86%
For #63678
Change-Id: I0571c2b540a816d57bd6ed8bb1df4191c7992d92
GitHub-Last-Rev: 7e95b8bfb035b53175f5a1b7d8750113933a7e17
GitHub-Pull-Request: golang/go#63847
Reviewed-on: https://go-review.googlesource.com/c/go/+/538715
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
|
|
Also rename 'substr' to 'sep' in IndexRabinKarp for consistency.
Change-Id: Icc2ad1116aecaf002c8264daa2fa608306c9a88a
GitHub-Last-Rev: 1784b93f53d569991f86585f9011120ea26f193f
GitHub-Pull-Request: golang/go#63854
Reviewed-on: https://go-review.googlesource.com/c/go/+/538716
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
|
|
This is a follow-up to CL 538175.
Change-Id: Iec2523b36a16d7e157c17858c89fcd43c2470d58
GitHub-Last-Rev: 812d36e57c71ea3bf44d2d64bde0703ef02a1b91
GitHub-Pull-Request: golang/go#63770
Reviewed-on: https://go-review.googlesource.com/c/go/+/538195
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
|
|
For #63678
goos: darwin
goarch: arm64
pkg: strings
│ count_old.txt │ count_new.txt │
│ sec/op │ sec/op vs base │
CountHard1-8 368.7µ ± 11% 332.0µ ± 1% -9.95% (p=0.002 n=10)
CountHard2-8 348.8µ ± 5% 333.1µ ± 1% -4.51% (p=0.000 n=10)
CountHard3-8 402.7µ ± 25% 359.5µ ± 1% -10.75% (p=0.000 n=10)
CountTorture-8 10.536µ ± 23% 9.913µ ± 0% -5.91% (p=0.000 n=10)
CountTortureOverlapping-8 74.86µ ± 9% 67.56µ ± 1% -9.75% (p=0.000 n=10)
CountByte/10-8 6.905n ± 3% 6.690n ± 1% -3.11% (p=0.001 n=10)
CountByte/32-8 3.247n ± 13% 3.207n ± 2% -1.23% (p=0.030 n=10)
CountByte/4096-8 83.72n ± 1% 82.58n ± 1% -1.36% (p=0.007 n=10)
CountByte/4194304-8 85.17µ ± 5% 84.02µ ± 8% ~ (p=0.075 n=10)
CountByte/67108864-8 1.497m ± 8% 1.397m ± 2% -6.69% (p=0.000 n=10)
geomean 9.977µ 9.426µ -5.53%
│ count_old.txt │ count_new.txt │
│ B/s │ B/s vs base │
CountByte/10-8 1.349Gi ± 3% 1.392Gi ± 1% +3.20% (p=0.002 n=10)
CountByte/32-8 9.180Gi ± 11% 9.294Gi ± 2% +1.24% (p=0.029 n=10)
CountByte/4096-8 45.57Gi ± 1% 46.20Gi ± 1% +1.38% (p=0.007 n=10)
CountByte/4194304-8 45.86Gi ± 5% 46.49Gi ± 7% ~ (p=0.075 n=10)
CountByte/67108864-8 41.75Gi ± 8% 44.74Gi ± 2% +7.16% (p=0.000 n=10)
geomean 16.10Gi 16.55Gi +2.85%
Change-Id: Ifc2173ba3a926b0fa9598372d4404b8645929d45
Reviewed-on: https://go-review.googlesource.com/c/go/+/538116
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Run-TryBot: shuang cui <imcusg@gmail.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
…generics
The logic of HashStrBytes and HashStrRevBytes is exactly the same as that of
HashStr and HashStrRev, except that the types are different.
Since the bootstrap toolchain is bumped to 1.20, we can eliminate them
by using generics.
Change-Id: I4336b1cab494ba963f09646c169b45f6b1ee62e3
GitHub-Last-Rev: b11a2bf9476d54bed4bd18a3f9269b5c95a66d67
GitHub-Pull-Request: golang/go#63766
Reviewed-on: https://go-review.googlesource.com/c/go/+/538175
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
The branch taken by the bytealg.Count algorithm used to process a single
32 bytes block per loop iteration. Throughput of the algorithm can be
improved by unrolling two iterations per loop: the lack of data
dependencies between each iteration allows for better utilization of the
CPU pipeline. The improvement is most significant on medium size payloads
that fit in the L1 cache; beyond the L1 cache size, memory bandwidth is
likely the bottleneck and the change does not show any measurable
improvements.
goos: linux
goarch: amd64
pkg: bytes
cpu: Intel(R) Xeon(R) CPU @ 2.60GHz
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
CountSingle/10 4.800n ± 0% 4.811n ± 0% +0.23% (p=0.000 n=10)
CountSingle/32 5.445n ± 0% 5.430n ± 0% ~ (p=0.085 n=10)
CountSingle/4K 81.38n ± 1% 63.12n ± 0% -22.43% (p=0.000 n=10)
CountSingle/4M 133.0µ ± 7% 130.1µ ± 4% ~ (p=0.280 n=10)
CountSingle/64M 4.079m ± 1% 4.070m ± 3% ~ (p=0.796 n=10)
geomean 1.029µ 973.3n -5.41%
│ old.txt │ new.txt │
│ B/s │ B/s vs base │
CountSingle/10 1.940Gi ± 0% 1.936Gi ± 0% -0.22% (p=0.000 n=10)
CountSingle/32 5.474Gi ± 0% 5.488Gi ± 0% ~ (p=0.075 n=10)
CountSingle/4K 46.88Gi ± 1% 60.43Gi ± 0% +28.92% (p=0.000 n=10)
CountSingle/4M 29.39Gi ± 7% 30.02Gi ± 4% ~ (p=0.280 n=10)
CountSingle/64M 15.32Gi ± 1% 15.36Gi ± 3% ~ (p=0.796 n=10)
geomean 11.75Gi 12.42Gi +5.71%
Change-Id: I1098228c726a2ee814806dcb438b7e92febf4370
Reviewed-on: https://go-review.googlesource.com/c/go/+/532457
Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
|
|
Handle comparisons of 15 or fewer bytes more efficiently
with Power10 instructions when building with GOPPC64=power10.
name old time/op new time/op delta
BytesCompare/1 2.53ns ± 0% 2.17ns ± 0% -14.17%
BytesCompare/2 2.70ns ± 0% 2.17ns ± 0% -19.77%
BytesCompare/4 2.59ns ± 0% 2.17ns ± 0% -16.20%
BytesCompare/8 2.66ns ± 0% 2.17ns ± 0% -18.63%
Change-Id: I6d7c6af0a58ea3e03acc3930c54b77f2ac1dfbd5
Reviewed-on: https://go-review.googlesource.com/c/go/+/522315
Reviewed-by: Joedian Reid <joedian@golang.org>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
|
|
To avoid duplicating them in net/netip and os and to allow these
packages to automatically benefit from future performance improvements
when optimized native LastIndexByte{,String} implementations are added.
For #36891
Change-Id: I4905a4742273570c2c36b867df57762c5bfbe1e4
Reviewed-on: https://go-review.googlesource.com/c/go/+/522475
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
Auto-Submit: Tobias Klauser <tobias.klauser@gmail.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
|
|
Power10 adds a handful of new instructions which make this
noticeably quicker for smaller values.
Likewise, since the vector loop requires 32B to enter,
unroll it once to count 32B per iteration. This
improvement benefits all PPC64 cpus.
On Power10 comparing a binary built with GOPPC64=power8
CountSingle/10 8.99ns ± 0% 5.55ns ± 3% -38.24%
CountSingle/16 7.55ns ± 0% 5.56ns ± 3% -26.37%
CountSingle/17 7.45ns ± 0% 5.25ns ± 0% -29.52%
CountSingle/31 18.4ns ± 0% 6.2ns ± 0% -66.41%
CountSingle/32 6.17ns ± 0% 5.04ns ± 0% -18.37%
CountSingle/33 7.13ns ± 0% 5.99ns ± 0% -15.94%
CountSingle/4K 198ns ± 0% 115ns ± 0% -42.08%
CountSingle/4M 190µs ± 0% 109µs ± 0% -42.49%
CountSingle/64M 3.28ms ± 0% 2.08ms ± 0% -36.53%
Furthermore, comparing the new tail implementation on
GOPPC64=power8 with GOPPC64=power10:
CountSingle/10 5.55ns ± 3% 4.52ns ± 1% -18.66%
CountSingle/16 5.56ns ± 3% 4.80ns ± 0% -13.65%
CountSingle/17 5.25ns ± 0% 4.79ns ± 0% -8.78%
CountSingle/31 6.17ns ± 0% 4.82ns ± 0% -21.79%
CountSingle/32 5.04ns ± 0% 5.09ns ± 6% +1.01%
CountSingle/33 5.99ns ± 0% 5.42ns ± 2% -9.54%
Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
|
|
Another optimization by aligning a hot loop.
```
│ sec/op │ sec/op vs base │
Count/10-16 11.29n ± 1% 10.50n ± 1% -7.04% (p=0.000 n=10)
Count/32-16 11.06n ± 1% 11.36n ± 2% +2.76% (p=0.000 n=10)
Count/4K-16 2.852µ ± 1% 1.953µ ± 1% -31.52% (p=0.000 n=10)
Count/4M-16 2.884m ± 1% 1.958m ± 1% -32.11% (p=0.000 n=10)
Count/64M-16 46.27m ± 1% 30.86m ± 0% -33.31% (p=0.000 n=10)
CountEasy/10-16 9.873n ± 1% 9.669n ± 1% -2.07% (p=0.000 n=10)
CountEasy/32-16 11.07n ± 1% 11.23n ± 1% +1.49% (p=0.000 n=10)
CountEasy/4K-16 73.47n ± 1% 54.20n ± 0% -26.22% (p=0.000 n=10)
CountEasy/4M-16 61.12µ ± 1% 49.42µ ± 0% -19.15% (p=0.000 n=10)
CountEasy/64M-16 1.303m ± 3% 1.082m ± 4% -16.97% (p=0.000 n=10)
CountSingle/10-16 4.150n ± 1% 3.679n ± 1% -11.36% (p=0.000 n=10)
CountSingle/32-16 4.815n ± 1% 4.588n ± 1% -4.71% (p=0.000 n=10)
CountSingle/4M-16 72.18µ ± 2% 75.38µ ± 1% +4.44% (p=0.000 n=10)
CountHard3-16 462.6µ ± 1% 484.4µ ± 1% +4.73% (p=0.000 n=10)
│ old.txt │ new.txt │
│ B/s │ B/s vs base │
Count/10-16 844.1Mi ± 1% 908.3Mi ± 1% +7.60% (p=0.000 n=10)
Count/32-16 2.695Gi ± 1% 2.623Gi ± 2% -2.66% (p=0.000 n=10)
Count/4K-16 1.337Gi ± 1% 1.953Gi ± 1% +46.06% (p=0.000 n=10)
Count/4M-16 1.355Gi ± 1% 1.995Gi ± 1% +47.29% (p=0.000 n=10)
Count/64M-16 1.351Gi ± 1% 2.026Gi ± 0% +49.95% (p=0.000 n=10)
CountEasy/10-16 965.9Mi ± 1% 986.3Mi ± 1% +2.11% (p=0.000 n=10)
CountEasy/32-16 2.693Gi ± 1% 2.653Gi ± 1% -1.48% (p=0.000 n=10)
CountEasy/4K-16 51.93Gi ± 1% 70.38Gi ± 0% +35.54% (p=0.000 n=10)
CountEasy/4M-16 63.91Gi ± 1% 79.05Gi ± 0% +23.68% (p=0.000 n=10)
CountEasy/64M-16 47.97Gi ± 3% 57.77Gi ± 4% +20.44% (p=0.000 n=10)
CountSingle/10-16 2.244Gi ± 1% 2.532Gi ± 1% +12.80% (p=0.000 n=10)
CountSingle/32-16 6.190Gi ± 1% 6.496Gi ± 1% +4.94% (p=0.000 n=10)
CountSingle/4M-16 54.12Gi ± 2% 51.82Gi ± 1% -4.25% (p=0.000 n=10)
```
Change-Id: I847b36125d2b11e2a88d31f48f6c160f041b3624
GitHub-Last-Rev: faacba662ee6bf41f69960060d48d340cfdbbbd6
GitHub-Pull-Request: golang/go#61793
Reviewed-on: https://go-review.googlesource.com/c/go/+/516455
TryBot-Result: Gopher Robot <gobot@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
|
|
Now with PCALIGN available on amd64 we can start optimizing some routines that benefit from instruction alignment.
```
│ sec/op │ sec/op vs base │
IndexByte/4K-16 69.89n ± ∞ ¹ 45.88n ± ∞ ¹ -34.35% (p=0.008 n=5)
IndexByte/4M-16 65.36µ ± ∞ ¹ 47.32µ ± ∞ ¹ -27.60% (p=0.008 n=5)
IndexByte/64M-16 1.435m ± ∞ ¹ 1.140m ± ∞ ¹ -20.57% (p=0.008 n=5)
│ B/s │ B/s vs base │
IndexByte/4K-16 54.58Gi ± ∞ ¹ 83.14Gi ± ∞ ¹ +52.32% (p=0.008 n=5)
IndexByte/4M-16 59.76Gi ± ∞ ¹ 82.54Gi ± ∞ ¹ +38.12% (p=0.008 n=5)
IndexByte/64M-16 43.56Gi ± ∞ ¹ 54.84Gi ± ∞ ¹ +25.89% (p=0.008 n=5)
```
Change-Id: Iff3dfd542c55e7569242be81f38b2887b9e04e87
GitHub-Last-Rev: f309f898b13ad8fdf88a21f2f105382db9ada2f5
GitHub-Pull-Request: golang/go#61792
Reviewed-on: https://go-review.googlesource.com/c/go/+/516435
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
|
|
Use generic implementation of IndexByte/IndexByteString
on plan9/amd64 since the assembly implementation
uses SSE instructions which are classified as floating
point instructions and cannot be used in a note handler.
A similar issue was fixed in CL 100577.
This fixes runtime.TestBreakpoint.
Fixes #61087.
Change-Id: Id0c085e47da449be405ea04ab9b93518c4e2fde8
Reviewed-on: https://go-review.googlesource.com/c/go/+/508400
Reviewed-by: Heschi Kreinick <heschi@google.com>
Auto-Submit: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: David du Colombier <0intro@gmail.com>
|
|
The riscv64 implementation of equal has an optimization that is
applied when both pointers share the same alignment but that alignment
is not 8 bytes. In this case it tries to align both pointers to an 8 byte boundary,
by individually comparing the first few bytes of each buffer. Unfortunately,
the existing code is incorrect. It adjusts the pointers by the wrong number
of bytes resulting, in most cases, in pointers that are not 8 byte aligned.
This commit fixes the issue by individually comparing the first
(8 - (pointer & 7)) bytes of each buffer rather than the first
(pointer & 7) bytes.
This particular optimization is not covered by any of the existing
benchmarks so a new benchmark, BenchmarkEqualBothUnaligned,
is provided. The benchmark tests the case where both pointers have
the same alignment but may not be 8 byte aligned. Results of the
new benchmark along with some of the existing benchmarks generated on
a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04
are presented below.
Equal/0-4 3.356n ± 0% 3.357n ± 0% ~ (p=0.840 n=10)
Equal/1-4 63.91n ± 7% 65.97n ± 5% +3.22% (p=0.029 n=10)
Equal/6-4 72.94n ± 5% 76.09n ± 4% ~ (p=0.075 n=10)
Equal/9-4 84.61n ± 7% 85.83n ± 3% ~ (p=0.315 n=10)
Equal/15-4 103.7n ± 2% 102.9n ± 4% ~ (p=0.739 n=10)
Equal/16-4 89.14n ± 3% 100.40n ± 4% +12.64% (p=0.000 n=10)
Equal/20-4 107.8n ± 3% 106.8n ± 3% ~ (p=0.725 n=10)
Equal/32-4 63.95n ± 8% 67.79n ± 7% ~ (p=0.089 n=10)
Equal/4K-4 1.256µ ± 1% 1.254µ ± 0% ~ (p=0.925 n=10)
Equal/4M-4 1.231m ± 0% 1.230m ± 0% -0.04% (p=0.011 n=10)
Equal/64M-4 19.77m ± 0% 19.78m ± 0% ~ (p=0.052 n=10)
EqualBothUnaligned/64_0-4 43.70n ± 4% 44.40n ± 5% ~ (p=0.529 n=10)
EqualBothUnaligned/64_1-4 6957.5n ± 0% 105.9n ± 1% -98.48% (p=0.000 n=10)
EqualBothUnaligned/64_4-4 100.1n ± 2% 101.5n ± 4% ~ (p=0.149 n=10)
EqualBothUnaligned/64_7-4 6965.00n ± 0% 95.60n ± 4% -98.63% (p=0.000 n=10)
EqualBothUnaligned/4096_0-4 1.233µ ± 1% 1.225µ ± 0% -0.65% (p=0.015 n=10)
EqualBothUnaligned/4096_1-4 584.226µ ± 0% 1.277µ ± 0% -99.78% (p=0.000 n=10)
EqualBothUnaligned/4096_4-4 1.270µ ± 1% 1.268µ ± 0% ~ (p=0.105 n=10)
EqualBothUnaligned/4096_7-4 584.944µ ± 0% 1.266µ ± 1% -99.78% (p=0.000 n=10)
EqualBothUnaligned/4194304_0-4 1.241m ± 0% 1.236m ± 0% -0.38% (p=0.035 n=10)
EqualBothUnaligned/4194304_1-4 600.956m ± 0% 1.238m ± 0% -99.79% (p=0.000 n=10)
EqualBothUnaligned/4194304_4-4 1.239m ± 0% 1.241m ± 0% +0.22% (p=0.007 n=10)
EqualBothUnaligned/4194304_7-4 601.036m ± 0% 1.239m ± 0% -99.79% (p=0.000 n=10)
EqualBothUnaligned/67108864_0-4 19.79m ± 0% 19.78m ± 0% ~ (p=0.393 n=10)
EqualBothUnaligned/67108864_1-4 9616.61m ± 0% 19.82m ± 0% -99.79% (p=0.000 n=10)
EqualBothUnaligned/67108864_4-4 19.82m ± 0% 19.82m ± 0% ~ (p=0.971 n=10)
EqualBothUnaligned/67108864_7-4 9616.34m ± 0% 19.86m ± 0% -99.79% (p=0.000 n=10)
geomean 38.38µ 7.194µ -81.26%
Change-Id: I4caab6c3450bd7e2773426b08b70bbc37fbe4e5f
Reviewed-on: https://go-review.googlesource.com/c/go/+/500855
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
|
|
The riscv64 implementation of compare has an optimization that is
applied when both pointers share the same alignment but that alignment
is not 8 bytes. In this case it tries to align both pointers to an 8 byte boundary,
by individually comparing the first few bytes of each buffer. Unfortunately,
the existing code is incorrect. It adjusts the pointers by the wrong number
of bytes resulting, in most cases, in pointers that are not 8 byte aligned.
This commit fixes the issue by individually comparing the first
(8 - (pointer & 7)) bytes of each buffer rather than the first
(pointer & 7) bytes.
We also remove an unnecessary immediate MOV instruction.
This particular optimization is not covered by any of the existing
benchmarks so a new benchmark, benchmarkCompareBytesBigBothUnaligned,
is provided. The benchmark tests the case where both pointers have
the same alignment but may not be 8 byte aligned. Results of the
new benchmark along with some of the existing benchmarks generated on
a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04
are presented below.
CompareBytesEqual-4 70.00n ± 6% 68.32n ± 0% -2.40% (p=0.020 n=10)
CompareBytesToNil-4 19.31n ± 0% 18.47n ± 0% -4.35% (p=0.000 n=10)
CompareBytesEmpty-4 16.79n ± 0% 15.95n ± 0% -4.97% (p=0.000 n=10)
CompareBytesIdentical-4 19.94n ± 15% 18.32n ± 13% -8.15% (p=0.040 n=10)
CompareBytesSameLength-4 37.93n ± 0% 42.44n ± 1% +11.91% (p=0.000 n=10)
CompareBytesDifferentLength-4 37.93n ± 0% 42.44n ± 0% +11.89% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1-4 3.881m ± 14% 3.880m ± 15% ~ (p=0.436 n=10)
CompareBytesBigUnaligned/offset=2-4 3.884m ± 0% 3.875m ± 0% ~ (p=0.190 n=10)
CompareBytesBigUnaligned/offset=3-4 3.858m ± 1% 3.868m ± 1% ~ (p=0.105 n=10)
CompareBytesBigUnaligned/offset=4-4 3.877m ± 1% 3.876m ± 0% ~ (p=0.529 n=10)
CompareBytesBigUnaligned/offset=5-4 3.859m ± 0% 3.874m ± 0% +0.39% (p=0.009 n=10)
CompareBytesBigUnaligned/offset=6-4 3.878m ± 1% 3.876m ± 0% ~ (p=0.353 n=10)
CompareBytesBigUnaligned/offset=7-4 3.868m ± 1% 3.877m ± 0% ~ (p=0.190 n=10)
CompareBytesBigBothUnaligned/offset=0-4 1.586m ± 0% 1.765m ± 0% +11.30% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1-4 153.132m ± 1% 1.765m ± 1% -98.85% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2-4 152.930m ± 1% 1.765m ± 1% -98.85% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3-4 152.093m ± 1% 1.769m ± 0% -98.84% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4-4 1.602m ± 0% 1.764m ± 0% +10.11% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5-4 152.314m ± 1% 1.768m ± 0% -98.84% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6-4 152.905m ± 1% 1.764m ± 1% -98.85% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7-4 152.951m ± 1% 1.804m ± 2% -98.82% (p=0.000 n=10)
CompareBytesBig-4 1.441m ± 21% 1.373m ± 55% ~ (p=0.481 n=10)
CompareBytesBigIdentical-4 19.94n ± 1% 19.10n ± 0% -4.21% (p=0.001 n=10)
geomean 243.7µ 76.65µ -68.54%
CompareBytesBigUnaligned/offset=1-4 257.7Mi ± 12% 257.7Mi ± 13% ~ (p=0.424 n=10)
CompareBytesBigUnaligned/offset=2-4 257.5Mi ± 0% 258.1Mi ± 0% ~ (p=0.190 n=10)
CompareBytesBigUnaligned/offset=3-4 259.2Mi ± 1% 258.5Mi ± 1% ~ (p=0.105 n=10)
CompareBytesBigUnaligned/offset=4-4 257.9Mi ± 1% 258.0Mi ± 0% ~ (p=0.529 n=10)
CompareBytesBigUnaligned/offset=5-4 259.1Mi ± 0% 258.1Mi ± 0% -0.39% (p=0.008 n=10)
CompareBytesBigUnaligned/offset=6-4 257.9Mi ± 1% 258.0Mi ± 0% ~ (p=0.353 n=10)
CompareBytesBigUnaligned/offset=7-4 258.5Mi ± 1% 257.9Mi ± 0% ~ (p=0.190 n=10)
CompareBytesBigBothUnaligned/offset=0-4 630.6Mi ± 0% 566.6Mi ± 0% -10.15% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1-4 6.533Mi ± 1% 566.545Mi ± 1% +8572.48% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2-4 6.537Mi ± 1% 566.683Mi ± 1% +8568.27% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3-4 6.576Mi ± 1% 565.200Mi ± 0% +8495.43% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4-4 624.2Mi ± 0% 566.9Mi ± 0% -9.18% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5-4 6.566Mi ± 1% 565.758Mi ± 0% +8516.41% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6-4 6.542Mi ± 1% 567.036Mi ± 1% +8567.35% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7-4 6.542Mi ± 1% 554.390Mi ± 2% +8374.05% (p=0.000 n=10)
CompareBytesBig-4 694.2Mi ± 18% 728.1Mi ± 35% ~ (p=0.481 n=10)
CompareBytesBigIdentical-4 47.83Ti ± 1% 49.92Ti ± 0% +4.39% (p=0.002 n=10)
geomean 170.0Mi 813.8Mi +378.66%
Change-Id: I0a2d0386d5ca1ffa249682a12ebd1533508e31e9
Reviewed-on: https://go-review.googlesource.com/c/go/+/497838
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: M Zhuo <mzh@golangcn.org>
|
|
The initial purpose of PCALIGN was to identify code
where it would be beneficial to align code for performance,
but avoid cases where too many NOPs were added. On p10, it
is now necessary to enforce a certain alignment in some
cases, so the behavior of PCALIGN needs to be slightly
different. Code will now be aligned to the value specified
on the PCALIGN instruction regardless of number of NOPs added,
which is more intuitive and consistent with power assembler
alignment directives.
This also adds 64 as a possible alignment value.
The existing values used in PCALIGN were modified according to
the new behavior.
A testcase was updated and performance testing was done to
verify that this does not adversely affect performance.
Change-Id: Iad1cf5ff112e5bfc0514f0805be90e24095e932b
Reviewed-on: https://go-review.googlesource.com/c/go/+/485056
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
|
|
Use P8 instructions throughout to be backwards compatible, but
otherwise not impede performance. Use overlapping loads where
possible, and prioritize larger checks over smaller checks.
However, some newer instructions can be used surgically when
targeting a newer GOPPC64. These can lead to noticeable
performance improvements with minimal impact to readability.
All tests run below on a Power10/ppc64le, and use a small
modification to BenchmarkIndexByte to ensure the IndexByte
wrapper call is inlined (as it likely is under realistic usage).
This wrapper adds substantial overhead if not inlined.
Previous (power9 path, GOPPC64=power8) vs. GOPPC64=power8:
IndexByte/1 3.81ns ± 8% 3.11ns ± 5% -18.39%
IndexByte/2 3.82ns ± 3% 3.20ns ± 6% -16.23%
IndexByte/3 3.61ns ± 4% 3.25ns ± 6% -10.13%
IndexByte/4 3.66ns ± 5% 3.08ns ± 1% -15.91%
IndexByte/5 3.82ns ± 0% 3.75ns ± 2% -1.94%
IndexByte/6 3.83ns ± 0% 3.87ns ± 4% +1.04%
IndexByte/7 3.83ns ± 0% 3.82ns ± 0% -0.27%
IndexByte/8 3.82ns ± 0% 2.92ns ±11% -23.70%
IndexByte/9 3.70ns ± 2% 3.08ns ± 2% -16.87%
IndexByte/10 3.74ns ± 2% 3.04ns ± 0% -18.75%
IndexByte/11 3.75ns ± 0% 3.31ns ± 8% -11.79%
IndexByte/12 3.74ns ± 0% 3.04ns ± 0% -18.86%
IndexByte/13 3.83ns ± 4% 3.04ns ± 0% -20.64%
IndexByte/14 3.80ns ± 1% 3.30ns ± 8% -13.18%
IndexByte/15 3.77ns ± 1% 3.04ns ± 0% -19.33%
IndexByte/16 3.81ns ± 0% 2.78ns ± 7% -26.88%
IndexByte/17 4.12ns ± 0% 3.04ns ± 1% -26.11%
IndexByte/18 4.27ns ± 6% 3.05ns ± 0% -28.64%
IndexByte/19 4.30ns ± 4% 3.02ns ± 2% -29.65%
IndexByte/20 4.43ns ± 7% 3.45ns ± 7% -22.15%
IndexByte/21 4.12ns ± 0% 3.03ns ± 1% -26.35%
IndexByte/22 4.40ns ± 6% 3.05ns ± 0% -30.82%
IndexByte/23 4.40ns ± 6% 3.01ns ± 2% -31.48%
IndexByte/24 4.32ns ± 5% 3.07ns ± 0% -28.98%
IndexByte/25 4.76ns ± 2% 3.04ns ± 1% -36.11%
IndexByte/26 4.82ns ± 0% 3.05ns ± 0% -36.66%
IndexByte/27 4.82ns ± 0% 2.97ns ± 3% -38.39%
IndexByte/28 4.82ns ± 0% 2.96ns ± 3% -38.57%
IndexByte/29 4.82ns ± 0% 3.34ns ± 9% -30.71%
IndexByte/30 4.82ns ± 0% 3.05ns ± 0% -36.77%
IndexByte/31 4.81ns ± 0% 3.05ns ± 0% -36.70%
IndexByte/32 3.52ns ± 0% 3.44ns ± 1% -2.15%
IndexByte/33 4.77ns ± 1% 3.35ns ± 0% -29.81%
IndexByte/34 5.01ns ± 5% 3.35ns ± 0% -33.15%
IndexByte/35 4.92ns ± 9% 3.35ns ± 0% -31.89%
IndexByte/36 4.81ns ± 5% 3.35ns ± 0% -30.37%
IndexByte/37 4.99ns ± 6% 3.35ns ± 0% -32.86%
IndexByte/38 5.06ns ± 5% 3.35ns ± 0% -33.84%
IndexByte/39 5.02ns ± 5% 3.48ns ± 9% -30.58%
IndexByte/40 5.21ns ± 9% 3.55ns ± 4% -31.82%
IndexByte/41 5.18ns ± 0% 3.42ns ± 2% -33.98%
IndexByte/42 5.19ns ± 0% 3.55ns ±11% -31.56%
IndexByte/43 5.18ns ± 0% 3.45ns ± 5% -33.46%
IndexByte/44 5.18ns ± 0% 3.39ns ± 0% -34.56%
IndexByte/45 5.18ns ± 0% 3.43ns ± 4% -33.74%
IndexByte/46 5.18ns ± 0% 3.47ns ± 1% -33.03%
IndexByte/47 5.18ns ± 0% 3.44ns ± 2% -33.54%
IndexByte/48 5.18ns ± 0% 3.39ns ± 0% -34.52%
IndexByte/49 5.69ns ± 0% 3.79ns ± 0% -33.45%
IndexByte/50 5.70ns ± 0% 3.70ns ± 3% -34.98%
IndexByte/51 5.70ns ± 0% 3.70ns ± 2% -35.05%
IndexByte/52 5.69ns ± 0% 3.80ns ± 1% -33.35%
IndexByte/53 5.69ns ± 0% 3.78ns ± 0% -33.54%
IndexByte/54 5.69ns ± 0% 3.78ns ± 1% -33.51%
IndexByte/55 5.69ns ± 0% 3.78ns ± 0% -33.61%
IndexByte/56 5.69ns ± 0% 3.81ns ± 3% -33.12%
IndexByte/57 6.20ns ± 0% 3.79ns ± 4% -38.89%
IndexByte/58 6.20ns ± 0% 3.74ns ± 2% -39.58%
IndexByte/59 6.20ns ± 0% 3.69ns ± 2% -40.47%
IndexByte/60 6.20ns ± 0% 3.79ns ± 1% -38.81%
IndexByte/61 6.20ns ± 0% 3.77ns ± 1% -39.23%
IndexByte/62 6.20ns ± 0% 3.79ns ± 0% -38.89%
IndexByte/63 6.20ns ± 0% 3.79ns ± 0% -38.90%
IndexByte/64 4.17ns ± 0% 3.47ns ± 3% -16.70%
IndexByte/65 5.38ns ± 0% 4.21ns ± 0% -21.59%
IndexByte/66 5.38ns ± 0% 4.21ns ± 0% -21.58%
IndexByte/67 5.38ns ± 0% 4.22ns ± 0% -21.58%
IndexByte/68 5.38ns ± 0% 4.22ns ± 0% -21.59%
IndexByte/69 5.38ns ± 0% 4.22ns ± 0% -21.56%
IndexByte/70 5.38ns ± 0% 4.21ns ± 0% -21.59%
IndexByte/71 5.37ns ± 0% 4.21ns ± 0% -21.51%
IndexByte/72 5.37ns ± 0% 4.22ns ± 0% -21.46%
IndexByte/73 5.71ns ± 0% 4.22ns ± 0% -26.20%
IndexByte/74 5.71ns ± 0% 4.21ns ± 0% -26.21%
IndexByte/75 5.71ns ± 0% 4.21ns ± 0% -26.17%
IndexByte/76 5.71ns ± 0% 4.22ns ± 0% -26.22%
IndexByte/77 5.71ns ± 0% 4.22ns ± 0% -26.22%
IndexByte/78 5.71ns ± 0% 4.21ns ± 0% -26.22%
IndexByte/79 5.71ns ± 0% 4.22ns ± 0% -26.21%
IndexByte/80 5.71ns ± 0% 4.21ns ± 0% -26.19%
IndexByte/81 6.20ns ± 0% 4.39ns ± 0% -29.13%
IndexByte/82 6.20ns ± 0% 4.36ns ± 0% -29.67%
IndexByte/83 6.20ns ± 0% 4.36ns ± 0% -29.63%
IndexByte/84 6.20ns ± 0% 4.39ns ± 0% -29.21%
IndexByte/85 6.20ns ± 0% 4.36ns ± 0% -29.64%
IndexByte/86 6.20ns ± 0% 4.36ns ± 0% -29.63%
IndexByte/87 6.20ns ± 0% 4.39ns ± 0% -29.21%
IndexByte/88 6.20ns ± 0% 4.36ns ± 0% -29.65%
IndexByte/89 6.74ns ± 0% 4.36ns ± 0% -35.33%
IndexByte/90 6.75ns ± 0% 4.37ns ± 0% -35.22%
IndexByte/91 6.74ns ± 0% 4.36ns ± 0% -35.30%
IndexByte/92 6.74ns ± 0% 4.36ns ± 0% -35.34%
IndexByte/93 6.74ns ± 0% 4.37ns ± 0% -35.20%
IndexByte/94 6.74ns ± 0% 4.36ns ± 0% -35.33%
IndexByte/95 6.75ns ± 0% 4.36ns ± 0% -35.32%
IndexByte/96 4.83ns ± 0% 4.34ns ± 2% -10.24%
IndexByte/97 5.91ns ± 0% 4.65ns ± 0% -21.24%
IndexByte/98 5.91ns ± 0% 4.65ns ± 0% -21.24%
IndexByte/99 5.91ns ± 0% 4.65ns ± 0% -21.23%
IndexByte/100 5.90ns ± 0% 4.65ns ± 0% -21.21%
IndexByte/101 5.90ns ± 0% 4.65ns ± 0% -21.22%
IndexByte/102 5.90ns ± 0% 4.65ns ± 0% -21.23%
IndexByte/103 5.91ns ± 0% 4.65ns ± 0% -21.23%
IndexByte/104 5.91ns ± 0% 4.65ns ± 0% -21.24%
IndexByte/105 6.25ns ± 0% 4.65ns ± 0% -25.59%
IndexByte/106 6.25ns ± 0% 4.65ns ± 0% -25.59%
IndexByte/107 6.25ns ± 0% 4.65ns ± 0% -25.60%
IndexByte/108 6.25ns ± 0% 4.65ns ± 0% -25.58%
IndexByte/109 6.24ns ± 0% 4.65ns ± 0% -25.50%
IndexByte/110 6.25ns ± 0% 4.65ns ± 0% -25.56%
IndexByte/111 6.25ns ± 0% 4.65ns ± 0% -25.60%
IndexByte/112 6.25ns ± 0% 4.65ns ± 0% -25.59%
IndexByte/113 6.76ns ± 0% 5.05ns ± 0% -25.37%
IndexByte/114 6.76ns ± 0% 5.05ns ± 0% -25.31%
IndexByte/115 6.76ns ± 0% 5.05ns ± 0% -25.38%
IndexByte/116 6.76ns ± 0% 5.05ns ± 0% -25.31%
IndexByte/117 6.76ns ± 0% 5.05ns ± 0% -25.38%
IndexByte/118 6.76ns ± 0% 5.05ns ± 0% -25.31%
IndexByte/119 6.76ns ± 0% 5.05ns ± 0% -25.38%
IndexByte/120 6.76ns ± 0% 5.05ns ± 0% -25.36%
IndexByte/121 7.35ns ± 0% 5.05ns ± 0% -31.33%
IndexByte/122 7.36ns ± 0% 5.05ns ± 0% -31.42%
IndexByte/123 7.38ns ± 0% 5.05ns ± 0% -31.60%
IndexByte/124 7.38ns ± 0% 5.05ns ± 0% -31.59%
IndexByte/125 7.38ns ± 0% 5.05ns ± 0% -31.60%
IndexByte/126 7.38ns ± 0% 5.05ns ± 0% -31.58%
IndexByte/128 5.28ns ± 0% 5.10ns ± 0% -3.41%
IndexByte/256 7.27ns ± 0% 7.28ns ± 2% +0.13%
IndexByte/512 12.1ns ± 0% 11.8ns ± 0% -2.51%
IndexByte/1K 23.1ns ± 3% 22.0ns ± 0% -4.66%
IndexByte/2K 42.6ns ± 0% 42.4ns ± 0% -0.41%
IndexByte/4K 90.3ns ± 0% 89.4ns ± 0% -0.98%
IndexByte/8K 170ns ± 0% 170ns ± 0% -0.59%
IndexByte/16K 331ns ± 0% 330ns ± 0% -0.27%
IndexByte/32K 660ns ± 0% 660ns ± 0% -0.08%
IndexByte/64K 1.30µs ± 0% 1.30µs ± 0% -0.08%
IndexByte/128K 2.58µs ± 0% 2.58µs ± 0% -0.04%
IndexByte/256K 5.15µs ± 0% 5.15µs ± 0% -0.04%
IndexByte/512K 10.3µs ± 0% 10.3µs ± 0% -0.03%
IndexByte/1M 20.6µs ± 0% 20.5µs ± 0% -0.03%
IndexByte/2M 41.1µs ± 0% 41.1µs ± 0% -0.03%
IndexByte/4M 82.2µs ± 0% 82.1µs ± 0% -0.02%
IndexByte/8M 164µs ± 0% 164µs ± 0% -0.01%
IndexByte/16M 328µs ± 0% 328µs ± 0% -0.01%
IndexByte/32M 657µs ± 0% 657µs ± 0% -0.00%
GOPPC64=power8 vs GOPPC64=power9. The improvement is
most noticeable between 16 and 64B, and goes away around
128B.
IndexByte/16 2.78ns ± 7% 2.65ns ±15% -4.74%
IndexByte/17 3.04ns ± 1% 2.80ns ± 3% -7.85%
IndexByte/18 3.05ns ± 0% 2.71ns ± 4% -11.00%
IndexByte/19 3.02ns ± 2% 2.76ns ±10% -8.74%
IndexByte/20 3.45ns ± 7% 2.91ns ± 0% -15.46%
IndexByte/21 3.03ns ± 1% 2.84ns ± 9% -6.33%
IndexByte/22 3.05ns ± 0% 2.67ns ± 1% -12.38%
IndexByte/23 3.01ns ± 2% 2.67ns ± 1% -11.24%
IndexByte/24 3.07ns ± 0% 2.92ns ±12% -4.79%
IndexByte/25 3.04ns ± 1% 3.15ns ±15% +3.63%
IndexByte/26 3.05ns ± 0% 2.83ns ±13% -7.33%
IndexByte/27 2.97ns ± 3% 2.98ns ±10% +0.56%
IndexByte/28 2.96ns ± 3% 2.96ns ± 9% -0.05%
IndexByte/29 3.34ns ± 9% 3.03ns ±12% -9.33%
IndexByte/30 3.05ns ± 0% 2.68ns ± 1% -12.05%
IndexByte/31 3.05ns ± 0% 2.83ns ±12% -7.27%
IndexByte/32 3.44ns ± 1% 3.21ns ±10% -6.78%
IndexByte/33 3.35ns ± 0% 3.41ns ± 2% +1.95%
IndexByte/34 3.35ns ± 0% 3.13ns ± 0% -6.53%
IndexByte/35 3.35ns ± 0% 3.13ns ± 0% -6.54%
IndexByte/36 3.35ns ± 0% 3.13ns ± 0% -6.52%
IndexByte/37 3.35ns ± 0% 3.13ns ± 0% -6.52%
IndexByte/38 3.35ns ± 0% 3.24ns ± 4% -3.30%
IndexByte/39 3.48ns ± 9% 3.44ns ± 2% -1.19%
IndexByte/40 3.55ns ± 4% 3.46ns ± 2% -2.44%
IndexByte/41 3.42ns ± 2% 3.39ns ± 4% -0.86%
IndexByte/42 3.55ns ±11% 3.46ns ± 1% -2.65%
IndexByte/43 3.45ns ± 5% 3.44ns ± 2% -0.31%
IndexByte/44 3.39ns ± 0% 3.43ns ± 3% +1.23%
IndexByte/45 3.43ns ± 4% 3.50ns ± 1% +2.07%
IndexByte/46 3.47ns ± 1% 3.46ns ± 2% -0.31%
IndexByte/47 3.44ns ± 2% 3.47ns ± 1% +0.78%
IndexByte/48 3.39ns ± 0% 3.46ns ± 2% +1.96%
IndexByte/49 3.79ns ± 0% 3.47ns ± 0% -8.41%
IndexByte/50 3.70ns ± 3% 3.64ns ± 5% -1.66%
IndexByte/51 3.70ns ± 2% 3.75ns ± 0% +1.40%
IndexByte/52 3.80ns ± 1% 3.77ns ± 0% -0.70%
IndexByte/53 3.78ns ± 0% 3.77ns ± 0% -0.46%
IndexByte/54 3.78ns ± 1% 3.53ns ± 7% -6.74%
IndexByte/55 3.78ns ± 0% 3.47ns ± 0% -8.17%
IndexByte/56 3.81ns ± 3% 3.45ns ± 0% -9.43%
IndexByte/57 3.79ns ± 4% 3.47ns ± 0% -8.45%
IndexByte/58 3.74ns ± 2% 3.55ns ± 4% -5.16%
IndexByte/59 3.69ns ± 2% 3.61ns ± 4% -2.01%
IndexByte/60 3.79ns ± 1% 3.45ns ± 0% -9.09%
IndexByte/61 3.77ns ± 1% 3.47ns ± 0% -7.93%
IndexByte/62 3.79ns ± 0% 3.45ns ± 0% -8.97%
IndexByte/63 3.79ns ± 0% 3.47ns ± 0% -8.44%
IndexByte/64 3.47ns ± 3% 3.18ns ± 0% -8.41%
GOPPC64=power9 vs GOPPC64=power10. Only sizes <16 will
show meaningful changes.
IndexByte/1 3.27ns ± 8% 2.36ns ± 2% -27.58%
IndexByte/2 3.06ns ± 4% 2.34ns ± 1% -23.42%
IndexByte/3 3.77ns ±11% 2.48ns ± 7% -34.03%
IndexByte/4 3.18ns ± 8% 2.33ns ± 1% -26.69%
IndexByte/5 3.18ns ± 5% 2.34ns ± 4% -26.26%
IndexByte/6 3.13ns ± 3% 2.35ns ± 1% -24.97%
IndexByte/7 3.25ns ± 1% 2.33ns ± 1% -28.22%
IndexByte/8 2.79ns ± 2% 2.36ns ± 1% -15.32%
IndexByte/9 2.90ns ± 0% 2.34ns ± 2% -19.36%
IndexByte/10 2.99ns ± 3% 2.31ns ± 1% -22.70%
IndexByte/11 3.13ns ± 7% 2.31ns ± 0% -26.08%
IndexByte/12 3.01ns ± 4% 2.32ns ± 1% -22.91%
IndexByte/13 2.98ns ± 3% 2.31ns ± 1% -22.72%
IndexByte/14 2.92ns ± 2% 2.61ns ±16% -10.58%
IndexByte/15 3.02ns ± 5% 2.69ns ± 7% -10.90%
IndexByte/16 2.65ns ±15% 2.29ns ± 1% -13.61%
Change-Id: I4482f762d25eabf60def4981a0b2bc0c10ccf50c
Reviewed-on: https://go-review.googlesource.com/c/go/+/478656
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
|
|
Rewrite index asm function to use the new power10 instruction lxvl,
stxvl or the load, store vector with length which can specify the
number of bytes to be stored in a register. This avoids the need to
create a separator mask and extra AND instructions. It also allows
us to process the tail end of the string using a lot fewer instructions
as we can load bytes of separator length directly rather than loading
16 bytes and masking out bytes that are greater than separator length.
On power9 and power8 the code remains unchanged.
The performance for smaller sizes improves the most; on larger sizes
we see minimal improvement.
name old time/op new time/op delta
Index/10 10.6ns ± 3% 9.8ns ± 2% -7.20%
Index/11 11.2ns ± 4% 10.6ns ± 0% -5.99%
Index/12 12.7ns ± 3% 11.3ns ± 0% -11.21%
Index/13 13.5ns ± 2% 11.7ns ± 0% -13.11%
Index/14 14.1ns ± 1% 12.0ns ± 0% -14.43%
Index/15 14.3ns ± 2% 12.4ns ± 0% -13.39%
Index/16 14.5ns ± 1% 12.7ns ± 0% -12.57%
Index/17 26.7ns ± 0% 25.9ns ± 0% -2.99%
Index/18 27.3ns ± 0% 26.4ns ± 1% -3.35%
Index/19 35.7ns ±16% 26.1ns ± 1% -26.87%
Index/20 29.4ns ± 0% 27.3ns ± 1% -7.06%
Index/21 29.3ns ± 0% 26.9ns ± 1% -8.37%
Index/22 30.0ns ± 0% 27.4ns ± 0% -8.68%
Index/23 29.9ns ± 0% 27.7ns ± 0% -7.15%
Index/24 31.0ns ± 0% 28.0ns ± 0% -9.92%
Index/25 31.7ns ± 0% 28.4ns ± 0% -10.54%
Index/26 30.6ns ± 0% 28.9ns ± 1% -5.67%
Index/27 31.4ns ± 0% 29.3ns ± 0% -6.71%
Index/28 32.7ns ± 0% 29.6ns ± 1% -9.36%
Index/29 33.3ns ± 0% 30.1ns ± 1% -9.70%
Index/30 32.4ns ± 0% 30.7ns ± 0% -5.23%
Index/31 33.2ns ± 0% 30.6ns ± 1% -7.83%
Index/32 34.3ns ± 0% 30.9ns ± 0% -9.94%
Index/64 46.8ns ± 0% 44.2ns ± 0% -5.66%
Index/128 71.2ns ± 0% 67.3ns ± 0% -5.43%
Index/256 129ns ± 0% 127ns ± 0% -1.67%
Index/2K 838ns ± 0% 804ns ± 0% -4.03%
Index/4K 1.65µs ± 0% 1.58µs ± 0% -4.25%
Index/2M 829µs ± 0% 793µs ± 0% -4.42%
Index/4M 1.65ms ± 0% 1.59ms ± 0% -4.19%
Index/64M 26.5ms ± 0% 25.4ms ± 0% -4.18%
IndexHard2 412µs ± 0% 396µs ± 0% -3.76%
IndexEasy/10 10.0ns ± 0% 9.3ns ± 1% -7.20%
IndexEasy/11 10.8ns ± 1% 11.0ns ± 1% +2.22%
IndexEasy/12 12.3ns ± 2% 11.5ns ± 1% -6.37%
IndexEasy/13 13.1ns ± 0% 11.7ns ± 2% -10.83%
IndexEasy/14 13.8ns ± 2% 11.9ns ± 1% -13.52%
IndexEasy/15 14.0ns ± 2% 12.4ns ± 2% -11.46%
IndexEasy/16 14.3ns ± 1% 12.5ns ± 0% -12.40%
CountHard2 415µs ± 0% 396µs ± 0% -4.48%
Change-Id: Id3efa5ed9c662a29f58125c7f866a09f29a59b6c
Reviewed-on: https://go-review.googlesource.com/c/go/+/478918
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
|
|
The LA464 micro-architecture is very sensitive to alignment of loops,
so the final performance of linked binaries can vary wildly due to
uncontrolled alignment of certain performance-critical loops. Now that
PCALIGN is available on loong64, let's make use of it and manually align
some assembly loops. The functions are identified based on perf records
of some easily regressed go1 benchmark cases (e.g. FmtFprintfPrefixedInt,
RegexpMatchEasy0_1K and Revcomp are particularly sensitive; even those
optimizations purely reducing dynamic instruction counts can regress
those cases by 6~12%, making the numbers almost useless).
Benchmark results on Loongson 3A5000 (which is an LA464 implementation):
goos: linux
goarch: loong64
pkg: test/bench/go1
│ CL 416154 │ this CL │
│ sec/op │ sec/op vs base │
BinaryTree17 14.10 ± 1% 14.10 ± 1% ~ (p=1.000 n=10)
Fannkuch11 3.672 ± 0% 3.579 ± 0% -2.53% (p=0.000 n=10)
FmtFprintfEmpty 94.72n ± 0% 94.73n ± 0% +0.01% (p=0.000 n=10)
FmtFprintfString 149.9n ± 0% 151.9n ± 0% +1.33% (p=0.000 n=10)
FmtFprintfInt 154.1n ± 0% 158.3n ± 0% +2.73% (p=0.000 n=10)
FmtFprintfIntInt 236.2n ± 0% 241.4n ± 0% +2.20% (p=0.000 n=10)
FmtFprintfPrefixedInt 314.2n ± 0% 320.2n ± 0% +1.91% (p=0.000 n=10)
FmtFprintfFloat 405.0n ± 0% 414.3n ± 0% +2.30% (p=0.000 n=10)
FmtManyArgs 933.6n ± 0% 949.9n ± 0% +1.75% (p=0.000 n=10)
GobDecode 15.51m ± 1% 15.24m ± 0% -1.77% (p=0.000 n=10)
GobEncode 18.42m ± 4% 18.10m ± 2% ~ (p=0.631 n=10)
Gzip 423.6m ± 0% 429.9m ± 0% +1.49% (p=0.000 n=10)
Gunzip 88.75m ± 0% 88.31m ± 0% -0.50% (p=0.000 n=10)
HTTPClientServer 85.44µ ± 0% 85.71µ ± 0% +0.31% (p=0.035 n=10)
JSONEncode 18.65m ± 0% 19.74m ± 0% +5.81% (p=0.000 n=10)
JSONDecode 77.75m ± 0% 78.60m ± 1% +1.09% (p=0.000 n=10)
Mandelbrot200 7.214m ± 0% 7.208m ± 0% ~ (p=0.481 n=10)
GoParse 7.616m ± 2% 7.616m ± 1% ~ (p=0.739 n=10)
RegexpMatchEasy0_32 142.9n ± 0% 133.0n ± 0% -6.93% (p=0.000 n=10)
RegexpMatchEasy0_1K 1.535µ ± 0% 1.362µ ± 0% -11.27% (p=0.000 n=10)
RegexpMatchEasy1_32 161.8n ± 0% 161.8n ± 0% ~ (p=0.628 n=10)
RegexpMatchEasy1_1K 1.635µ ± 0% 1.497µ ± 0% -8.41% (p=0.000 n=10)
RegexpMatchMedium_32 1.429µ ± 0% 1.420µ ± 0% -0.63% (p=0.000 n=10)
RegexpMatchMedium_1K 41.86µ ± 0% 42.25µ ± 0% +0.93% (p=0.000 n=10)
RegexpMatchHard_32 2.144µ ± 0% 2.108µ ± 0% -1.68% (p=0.000 n=10)
RegexpMatchHard_1K 63.83µ ± 0% 62.65µ ± 0% -1.86% (p=0.000 n=10)
Revcomp 1.337 ± 0% 1.192 ± 0% -10.89% (p=0.000 n=10)
Template 116.4m ± 1% 115.6m ± 2% ~ (p=0.579 n=10)
TimeParse 421.4n ± 2% 418.1n ± 1% -0.78% (p=0.001 n=10)
TimeFormat 515.1n ± 0% 517.9n ± 0% +0.54% (p=0.001 n=10)
geomean 104.5µ 103.5µ -0.99%
│ CL 416154 │ this CL │
│ B/s │ B/s vs base │
GobDecode 47.19Mi ± 1% 48.04Mi ± 0% +1.80% (p=0.000 n=10)
GobEncode 39.73Mi ± 4% 40.44Mi ± 2% ~ (p=0.631 n=10)
Gzip 43.68Mi ± 0% 43.04Mi ± 0% -1.47% (p=0.000 n=10)
Gunzip 208.5Mi ± 0% 209.6Mi ± 0% +0.50% (p=0.000 n=10)
JSONEncode 99.21Mi ± 0% 93.76Mi ± 0% -5.49% (p=0.000 n=10)
JSONDecode 23.80Mi ± 0% 23.55Mi ± 1% -1.08% (p=0.000 n=10)
GoParse 7.253Mi ± 2% 7.253Mi ± 1% ~ (p=0.810 n=10)
RegexpMatchEasy0_32 213.6Mi ± 0% 229.4Mi ± 0% +7.41% (p=0.000 n=10)
RegexpMatchEasy0_1K 636.3Mi ± 0% 717.3Mi ± 0% +12.73% (p=0.000 n=10)
RegexpMatchEasy1_32 188.6Mi ± 0% 188.6Mi ± 0% ~ (p=0.810 n=10)
RegexpMatchEasy1_1K 597.4Mi ± 0% 652.2Mi ± 0% +9.17% (p=0.000 n=10)
RegexpMatchMedium_32 21.35Mi ± 0% 21.49Mi ± 0% +0.63% (p=0.000 n=10)
RegexpMatchMedium_1K 23.33Mi ± 0% 23.11Mi ± 0% -0.94% (p=0.000 n=10)
RegexpMatchHard_32 14.24Mi ± 0% 14.48Mi ± 0% +1.67% (p=0.000 n=10)
RegexpMatchHard_1K 15.30Mi ± 0% 15.59Mi ± 0% +1.93% (p=0.000 n=10)
Revcomp 181.3Mi ± 0% 203.4Mi ± 0% +12.21% (p=0.000 n=10)
Template 15.89Mi ± 1% 16.00Mi ± 2% ~ (p=0.542 n=10)
geomean 59.33Mi 60.72Mi +2.33%
Change-Id: I9ac28d936e03d21c46bb19fa100018f61ace6b42
Reviewed-on: https://go-review.googlesource.com/c/go/+/479816
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
Run-TryBot: WANG Xuerui <git@xen0n.name>
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Ian Lance Taylor <iant@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
|
|
Merge the P8 and P9 paths into one. This removes the need for
a runtime CPU check and maintaining two separate code paths.
This takes advantage of overlapping checks, and the P9 SETB
(emulated with little overhead on P8) to speed up comparisons
of small strings.
Similarly, the SETB instruction can be used on GOPPC64=power9
which provides a small speedup over using a couple ISELs. This
only accounts for a few percent on very small strings, thus
results of running P8 codegen on P9 are left out.
For the baseline on a power8 machine:
BytesCompare/1 7.76ns ± 0% 6.38ns ± 0% -17.71%
BytesCompare/2 7.77ns ± 0% 6.36ns ± 0% -18.12%
BytesCompare/3 7.56ns ± 0% 6.36ns ± 0% -15.79%
BytesCompare/4 7.76ns ± 0% 5.74ns ± 0% -25.99%
BytesCompare/5 7.48ns ± 0% 5.74ns ± 0% -23.29%
BytesCompare/6 7.56ns ± 0% 5.74ns ± 0% -24.06%
BytesCompare/7 7.14ns ± 0% 5.74ns ± 0% -19.63%
BytesCompare/8 5.58ns ± 0% 5.19ns ± 0% -7.03%
BytesCompare/9 7.85ns ± 0% 5.19ns ± 0% -33.86%
BytesCompare/10 7.87ns ± 0% 5.19ns ± 0% -34.06%
BytesCompare/11 7.59ns ± 0% 5.19ns ± 0% -31.59%
BytesCompare/12 7.87ns ± 0% 5.19ns ± 0% -34.02%
BytesCompare/13 7.55ns ± 0% 5.19ns ± 0% -31.24%
BytesCompare/14 7.47ns ± 0% 5.19ns ± 0% -30.53%
BytesCompare/15 7.88ns ± 0% 5.19ns ± 0% -34.09%
BytesCompare/16 6.07ns ± 0% 5.58ns ± 0% -8.08%
BytesCompare/17 9.05ns ± 0% 5.62ns ± 0% -37.94%
BytesCompare/18 8.95ns ± 0% 5.62ns ± 0% -37.24%
BytesCompare/19 8.49ns ± 0% 5.62ns ± 0% -33.81%
BytesCompare/20 9.07ns ± 0% 5.62ns ± 0% -38.05%
BytesCompare/21 8.69ns ± 0% 5.62ns ± 0% -35.37%
BytesCompare/22 8.57ns ± 0% 5.62ns ± 0% -34.43%
BytesCompare/23 8.31ns ± 0% 5.62ns ± 0% -32.38%
BytesCompare/24 8.42ns ± 0% 5.62ns ± 0% -33.23%
BytesCompare/25 9.70ns ± 0% 5.56ns ± 0% -42.69%
BytesCompare/26 9.53ns ± 0% 5.56ns ± 0% -41.66%
BytesCompare/27 9.29ns ± 0% 5.56ns ± 0% -40.15%
BytesCompare/28 9.53ns ± 0% 5.56ns ± 0% -41.65%
BytesCompare/29 9.37ns ± 0% 5.56ns ± 0% -40.63%
BytesCompare/30 9.17ns ± 0% 5.56ns ± 0% -39.36%
BytesCompare/31 9.07ns ± 0% 5.56ns ± 0% -38.71%
BytesCompare/32 5.81ns ± 0% 5.49ns ± 0% -5.49%
BytesCompare/33 9.36ns ± 0% 5.32ns ± 0% -43.17%
BytesCompare/34 9.44ns ± 0% 5.32ns ± 0% -43.68%
BytesCompare/35 8.91ns ± 0% 5.32ns ± 0% -40.29%
BytesCompare/36 9.45ns ± 0% 5.32ns ± 0% -43.71%
BytesCompare/37 8.94ns ± 0% 5.32ns ± 0% -40.53%
BytesCompare/38 9.08ns ± 0% 5.32ns ± 0% -41.44%
BytesCompare/39 8.62ns ± 0% 5.32ns ± 0% -38.33%
BytesCompare/40 7.93ns ± 0% 5.32ns ± 0% -32.93%
BytesCompare/41 10.1ns ± 0% 5.3ns ± 0% -47.08%
BytesCompare/42 10.1ns ± 0% 5.3ns ± 0% -47.43%
BytesCompare/43 9.80ns ± 0% 5.32ns ± 0% -45.66%
BytesCompare/44 10.3ns ± 0% 5.3ns ± 0% -48.26%
BytesCompare/45 9.88ns ± 0% 5.33ns ± 0% -46.08%
BytesCompare/46 9.82ns ± 0% 5.32ns ± 0% -45.81%
BytesCompare/47 9.73ns ± 0% 5.33ns ± 0% -45.25%
BytesCompare/48 8.31ns ± 0% 5.22ns ± 0% -37.19%
BytesCompare/49 11.2ns ± 0% 5.2ns ± 0% -53.28%
BytesCompare/50 11.1ns ± 0% 5.2ns ± 0% -52.86%
BytesCompare/51 10.8ns ± 0% 5.2ns ± 0% -51.37%
BytesCompare/52 11.1ns ± 0% 5.2ns ± 0% -52.94%
BytesCompare/53 10.8ns ± 0% 5.2ns ± 0% -51.50%
BytesCompare/54 10.7ns ± 0% 5.2ns ± 0% -51.09%
BytesCompare/55 10.3ns ± 0% 5.2ns ± 0% -49.49%
BytesCompare/56 10.9ns ± 0% 5.2ns ± 0% -51.73%
BytesCompare/57 12.2ns ± 0% 5.3ns ± 0% -56.92%
BytesCompare/58 12.2ns ± 0% 5.3ns ± 0% -56.81%
BytesCompare/59 11.5ns ± 0% 5.3ns ± 0% -54.45%
BytesCompare/60 12.1ns ± 0% 5.3ns ± 0% -56.67%
BytesCompare/61 11.7ns ± 0% 5.3ns ± 0% -54.96%
BytesCompare/62 11.9ns ± 0% 5.3ns ± 0% -55.76%
BytesCompare/63 11.4ns ± 0% 5.3ns ± 0% -53.73%
BytesCompare/64 6.08ns ± 0% 5.47ns ± 0% -9.96%
BytesCompare/65 9.87ns ± 0% 5.96ns ± 0% -39.57%
BytesCompare/66 9.81ns ± 0% 5.96ns ± 0% -39.25%
BytesCompare/67 9.49ns ± 0% 5.96ns ± 0% -37.18%
BytesCompare/68 9.81ns ± 0% 5.96ns ± 0% -39.26%
BytesCompare/69 9.44ns ± 0% 5.96ns ± 0% -36.84%
BytesCompare/70 9.58ns ± 0% 5.96ns ± 0% -37.75%
BytesCompare/71 9.24ns ± 0% 5.96ns ± 0% -35.50%
BytesCompare/72 8.26ns ± 0% 5.94ns ± 0% -28.09%
BytesCompare/73 10.6ns ± 0% 5.9ns ± 0% -43.70%
BytesCompare/74 10.6ns ± 0% 5.9ns ± 0% -43.87%
BytesCompare/75 10.2ns ± 0% 5.9ns ± 0% -41.83%
BytesCompare/76 10.7ns ± 0% 5.9ns ± 0% -44.55%
BytesCompare/77 10.3ns ± 0% 5.9ns ± 0% -42.51%
BytesCompare/78 10.3ns ± 0% 5.9ns ± 0% -42.29%
BytesCompare/79 10.2ns ± 0% 5.9ns ± 0% -41.95%
BytesCompare/80 8.74ns ± 0% 5.93ns ± 0% -32.23%
BytesCompare/81 11.7ns ± 0% 6.8ns ± 0% -41.87%
BytesCompare/82 11.7ns ± 0% 6.8ns ± 0% -41.54%
BytesCompare/83 11.1ns ± 0% 6.8ns ± 0% -38.32%
BytesCompare/84 11.7ns ± 0% 6.8ns ± 0% -41.59%
BytesCompare/85 11.2ns ± 0% 6.8ns ± 0% -38.93%
BytesCompare/86 11.2ns ± 0% 6.8ns ± 0% -38.87%
BytesCompare/87 10.8ns ± 0% 6.8ns ± 0% -37.07%
BytesCompare/88 11.3ns ± 0% 6.7ns ± 0% -40.57%
BytesCompare/89 12.6ns ± 0% 6.7ns ± 0% -46.57%
BytesCompare/90 12.6ns ± 0% 6.7ns ± 0% -46.44%
BytesCompare/91 11.9ns ± 0% 6.7ns ± 0% -43.66%
BytesCompare/92 12.5ns ± 0% 6.7ns ± 0% -46.09%
BytesCompare/93 12.2ns ± 0% 6.7ns ± 0% -44.90%
BytesCompare/94 12.4ns ± 0% 6.7ns ± 0% -45.62%
BytesCompare/95 11.8ns ± 0% 6.7ns ± 0% -43.00%
BytesCompare/96 7.25ns ± 0% 6.62ns ± 0% -8.70%
BytesCompare/97 11.1ns ± 0% 7.2ns ± 0% -34.98%
BytesCompare/98 10.9ns ± 0% 7.2ns ± 0% -34.03%
BytesCompare/99 10.4ns ± 0% 7.2ns ± 0% -31.19%
BytesCompare/100 10.9ns ± 0% 7.2ns ± 0% -33.97%
BytesCompare/101 10.4ns ± 0% 7.2ns ± 0% -31.19%
BytesCompare/102 10.7ns ± 0% 7.2ns ± 0% -32.72%
BytesCompare/103 10.2ns ± 0% 7.2ns ± 0% -29.28%
BytesCompare/104 9.38ns ± 0% 7.19ns ± 0% -23.33%
BytesCompare/105 11.7ns ± 0% 7.2ns ± 0% -38.60%
BytesCompare/106 11.7ns ± 0% 7.2ns ± 0% -38.28%
BytesCompare/107 11.3ns ± 0% 7.2ns ± 0% -36.48%
BytesCompare/108 11.7ns ± 0% 7.2ns ± 0% -38.49%
BytesCompare/109 11.4ns ± 0% 7.2ns ± 0% -36.76%
BytesCompare/110 11.3ns ± 0% 7.2ns ± 0% -36.37%
BytesCompare/111 11.1ns ± 0% 7.2ns ± 0% -35.05%
BytesCompare/112 9.95ns ± 0% 7.19ns ± 0% -27.71%
BytesCompare/113 12.7ns ± 0% 7.0ns ± 0% -44.71%
BytesCompare/114 12.6ns ± 0% 7.0ns ± 0% -44.23%
BytesCompare/115 12.3ns ± 0% 7.0ns ± 0% -42.83%
BytesCompare/116 12.7ns ± 0% 7.0ns ± 0% -44.67%
BytesCompare/117 12.2ns ± 0% 7.0ns ± 0% -42.41%
BytesCompare/118 12.2ns ± 0% 7.0ns ± 0% -42.50%
BytesCompare/119 11.9ns ± 0% 7.0ns ± 0% -40.76%
BytesCompare/120 12.3ns ± 0% 7.0ns ± 0% -43.01%
BytesCompare/121 13.7ns ± 0% 7.0ns ± 0% -48.55%
BytesCompare/122 13.6ns ± 0% 7.0ns ± 0% -48.06%
BytesCompare/123 12.9ns ± 0% 7.0ns ± 0% -45.44%
BytesCompare/124 13.5ns ± 0% 7.0ns ± 0% -47.91%
BytesCompare/125 13.0ns ± 0% 7.0ns ± 0% -46.03%
BytesCompare/126 13.2ns ± 0% 7.0ns ± 0% -46.72%
BytesCompare/127 12.9ns ± 0% 7.0ns ± 0% -45.36%
BytesCompare/128 7.53ns ± 0% 6.78ns ± 0% -9.95%
BytesCompare/256 10.1ns ± 0% 9.6ns ± 0% -4.35%
BytesCompare/512 23.0ns ± 0% 15.3ns ± 0% -33.30%
BytesCompare/1024 36.4ns ± 0% 32.8ns ± 0% -9.83%
BytesCompare/2048 62.0ns ± 0% 56.0ns ± 0% -9.77%
For GOPPC64=power9 on power9:
BytesCompare/1 5.95ns ± 0% 4.83ns ± 0% -18.89%
BytesCompare/2 6.37ns ± 0% 4.69ns ± 0% -26.39%
BytesCompare/3 6.87ns ± 0% 4.68ns ± 0% -31.79%
BytesCompare/4 5.86ns ± 0% 4.63ns ± 0% -20.98%
BytesCompare/5 5.84ns ± 0% 4.63ns ± 0% -20.67%
BytesCompare/6 5.84ns ± 0% 4.63ns ± 0% -20.70%
BytesCompare/7 5.82ns ± 0% 4.63ns ± 0% -20.40%
BytesCompare/8 5.81ns ± 0% 4.64ns ± 0% -20.23%
BytesCompare/9 5.83ns ± 0% 4.71ns ± 0% -19.19%
BytesCompare/10 6.22ns ± 0% 4.71ns ± 0% -24.32%
BytesCompare/11 6.94ns ± 0% 4.71ns ± 0% -32.16%
BytesCompare/12 5.77ns ± 0% 4.71ns ± 0% -18.34%
BytesCompare/13 5.77ns ± 0% 4.71ns ± 0% -18.44%
BytesCompare/14 5.77ns ± 0% 4.71ns ± 0% -18.31%
BytesCompare/15 6.31ns ± 0% 4.71ns ± 0% -25.32%
BytesCompare/16 4.99ns ± 0% 5.03ns ± 0% +0.72%
BytesCompare/17 5.07ns ± 0% 5.03ns ± 0% -0.87%
BytesCompare/18 5.07ns ± 0% 5.03ns ± 0% -0.81%
BytesCompare/19 5.07ns ± 0% 5.03ns ± 0% -0.85%
BytesCompare/20 5.07ns ± 0% 5.03ns ± 0% -0.73%
BytesCompare/21 5.07ns ± 0% 5.03ns ± 0% -0.81%
BytesCompare/22 5.07ns ± 0% 5.03ns ± 0% -0.77%
BytesCompare/23 5.07ns ± 0% 5.03ns ± 0% -0.75%
BytesCompare/24 5.08ns ± 0% 5.07ns ± 0% -0.12%
BytesCompare/25 5.03ns ± 0% 5.00ns ± 0% -0.60%
BytesCompare/26 5.02ns ± 0% 5.00ns ± 0% -0.56%
BytesCompare/27 5.03ns ± 0% 5.00ns ± 0% -0.60%
BytesCompare/28 5.03ns ± 0% 5.00ns ± 0% -0.72%
BytesCompare/29 5.03ns ± 0% 5.00ns ± 0% -0.68%
BytesCompare/30 5.03ns ± 0% 5.00ns ± 0% -0.76%
BytesCompare/31 5.03ns ± 0% 5.00ns ± 0% -0.60%
BytesCompare/32 5.02ns ± 0% 5.05ns ± 0% +0.56%
BytesCompare/33 6.78ns ± 0% 5.16ns ± 0% -23.84%
BytesCompare/34 7.26ns ± 0% 5.16ns ± 0% -28.93%
BytesCompare/35 7.78ns ± 0% 5.16ns ± 0% -33.65%
BytesCompare/36 6.72ns ± 0% 5.16ns ± 0% -23.24%
BytesCompare/37 7.32ns ± 0% 5.16ns ± 0% -29.55%
BytesCompare/38 7.26ns ± 0% 5.16ns ± 0% -28.95%
BytesCompare/39 7.99ns ± 0% 5.16ns ± 0% -35.40%
BytesCompare/40 6.67ns ± 0% 5.11ns ± 0% -23.41%
BytesCompare/41 7.25ns ± 0% 5.14ns ± 0% -29.05%
BytesCompare/42 7.47ns ± 0% 5.14ns ± 0% -31.11%
BytesCompare/43 7.97ns ± 0% 5.14ns ± 0% -35.42%
BytesCompare/44 7.29ns ± 0% 5.14ns ± 0% -29.38%
BytesCompare/45 8.06ns ± 0% 5.14ns ± 0% -36.20%
BytesCompare/46 7.89ns ± 0% 5.14ns ± 0% -34.77%
BytesCompare/47 8.59ns ± 0% 5.14ns ± 0% -40.13%
BytesCompare/48 5.57ns ± 0% 5.12ns ± 0% -8.18%
BytesCompare/49 6.05ns ± 0% 5.17ns ± 0% -14.48%
BytesCompare/50 6.05ns ± 0% 5.17ns ± 0% -14.51%
BytesCompare/51 6.06ns ± 0% 5.17ns ± 0% -14.61%
BytesCompare/52 6.05ns ± 0% 5.17ns ± 0% -14.54%
BytesCompare/53 6.06ns ± 0% 5.17ns ± 0% -14.56%
BytesCompare/54 6.05ns ± 0% 5.17ns ± 0% -14.54%
BytesCompare/55 6.05ns ± 0% 5.17ns ± 0% -14.54%
BytesCompare/56 6.02ns ± 0% 5.11ns ± 0% -15.13%
BytesCompare/57 6.01ns ± 0% 5.14ns ± 0% -14.56%
BytesCompare/58 6.02ns ± 0% 5.14ns ± 0% -14.59%
BytesCompare/59 6.02ns ± 0% 5.14ns ± 0% -14.65%
BytesCompare/60 6.03ns ± 0% 5.14ns ± 0% -14.71%
BytesCompare/61 6.02ns ± 0% 5.14ns ± 0% -14.69%
BytesCompare/62 6.01ns ± 0% 5.14ns ± 0% -14.55%
BytesCompare/63 6.02ns ± 0% 5.14ns ± 0% -14.65%
BytesCompare/64 6.09ns ± 0% 5.15ns ± 0% -15.34%
BytesCompare/65 7.83ns ± 0% 5.93ns ± 0% -24.17%
BytesCompare/66 7.86ns ± 0% 5.93ns ± 0% -24.52%
BytesCompare/67 8.56ns ± 0% 5.93ns ± 0% -30.68%
BytesCompare/68 7.90ns ± 0% 5.93ns ± 0% -24.88%
BytesCompare/69 8.58ns ± 0% 5.93ns ± 0% -30.84%
BytesCompare/70 8.54ns ± 0% 5.93ns ± 0% -30.48%
BytesCompare/71 9.18ns ± 0% 5.94ns ± 0% -35.34%
BytesCompare/72 7.89ns ± 0% 5.86ns ± 0% -25.76%
BytesCompare/73 8.59ns ± 0% 5.82ns ± 0% -32.25%
BytesCompare/74 8.52ns ± 0% 5.82ns ± 0% -31.61%
BytesCompare/75 9.17ns ± 0% 5.82ns ± 0% -36.50%
BytesCompare/76 8.54ns ± 0% 5.82ns ± 0% -31.85%
BytesCompare/77 9.25ns ± 0% 5.82ns ± 0% -37.07%
BytesCompare/78 9.17ns ± 0% 5.82ns ± 0% -36.48%
BytesCompare/79 10.0ns ± 0% 5.8ns ± 0% -41.66%
BytesCompare/80 6.76ns ± 0% 5.69ns ± 0% -15.90%
BytesCompare/81 7.63ns ± 0% 6.70ns ± 0% -12.23%
BytesCompare/82 7.63ns ± 0% 6.70ns ± 0% -12.23%
BytesCompare/83 7.63ns ± 0% 6.70ns ± 0% -12.24%
BytesCompare/84 7.63ns ± 0% 6.70ns ± 0% -12.24%
BytesCompare/85 7.63ns ± 0% 6.70ns ± 0% -12.23%
BytesCompare/86 7.63ns ± 0% 6.70ns ± 0% -12.24%
BytesCompare/87 7.63ns ± 0% 6.70ns ± 0% -12.24%
BytesCompare/88 7.53ns ± 0% 6.56ns ± 0% -12.90%
BytesCompare/89 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/90 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/91 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/92 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/93 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/94 7.53ns ± 0% 6.55ns ± 0% -12.93%
BytesCompare/95 7.53ns ± 0% 6.55ns ± 0% -12.94%
BytesCompare/96 7.02ns ± 0% 6.45ns ± 0% -8.09%
BytesCompare/97 8.73ns ± 0% 7.39ns ± 0% -15.35%
BytesCompare/98 8.71ns ± 0% 7.39ns ± 0% -15.15%
BytesCompare/99 9.42ns ± 0% 7.39ns ± 0% -21.57%
BytesCompare/100 8.73ns ± 0% 7.39ns ± 0% -15.36%
BytesCompare/101 9.43ns ± 0% 7.39ns ± 0% -21.70%
BytesCompare/102 9.42ns ± 0% 7.39ns ± 0% -21.59%
BytesCompare/103 10.2ns ± 0% 7.4ns ± 0% -27.58%
BytesCompare/104 8.74ns ± 0% 7.35ns ± 0% -15.95%
BytesCompare/105 9.44ns ± 0% 7.30ns ± 0% -22.67%
BytesCompare/106 9.44ns ± 0% 7.30ns ± 0% -22.69%
BytesCompare/107 10.2ns ± 0% 7.3ns ± 0% -28.53%
BytesCompare/108 9.48ns ± 0% 7.30ns ± 0% -23.04%
BytesCompare/109 10.2ns ± 0% 7.3ns ± 0% -28.81%
BytesCompare/110 10.2ns ± 0% 7.3ns ± 0% -28.39%
BytesCompare/111 10.9ns ± 0% 7.3ns ± 0% -33.18%
BytesCompare/112 7.75ns ± 0% 7.16ns ± 0% -7.60%
BytesCompare/113 8.57ns ± 0% 7.83ns ± 0% -8.60%
BytesCompare/114 8.57ns ± 0% 7.83ns ± 0% -8.63%
BytesCompare/115 8.57ns ± 0% 7.83ns ± 0% -8.56%
BytesCompare/116 8.57ns ± 0% 7.83ns ± 0% -8.57%
BytesCompare/117 8.57ns ± 0% 7.83ns ± 0% -8.56%
BytesCompare/118 8.57ns ± 0% 7.83ns ± 0% -8.56%
BytesCompare/119 8.57ns ± 0% 7.83ns ± 0% -8.61%
BytesCompare/120 8.46ns ± 0% 7.71ns ± 0% -8.80%
BytesCompare/121 8.46ns ± 0% 7.72ns ± 0% -8.77%
BytesCompare/122 8.46ns ± 0% 7.72ns ± 0% -8.78%
BytesCompare/123 8.46ns ± 0% 7.72ns ± 0% -8.76%
BytesCompare/124 8.46ns ± 0% 7.72ns ± 0% -8.70%
BytesCompare/125 8.46ns ± 0% 7.72ns ± 0% -8.70%
BytesCompare/126 8.46ns ± 0% 7.72ns ± 0% -8.70%
BytesCompare/127 8.46ns ± 0% 7.72ns ± 0% -8.71%
BytesCompare/128 8.19ns ± 0% 7.35ns ± 0% -10.29%
BytesCompare/256 12.8ns ± 0% 11.4ns ± 0% -11.23%
BytesCompare/512 22.2ns ± 0% 20.7ns ± 0% -6.80%
BytesCompare/1024 41.1ns ± 0% 39.8ns ± 0% -3.12%
BytesCompare/2048 86.5ns ± 0% 81.1ns ± 0% -6.31%
Change-Id: I7c7fb1f7b891c23c6cade580e7b9928ca1a6efc3
Reviewed-on: https://go-review.googlesource.com/c/go/+/474496
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
|
|
Rewrite equal asm function to use the new power10 instructions lxvl
and stxvl (load and store with variable length), which can simplify
the tail end bytes comparison process. Cleaned up code on CR
register usage.
On power9 and power8 the code remains unchanged. The performance
for multiple sizes <= 16 improves on power10 with the change.
name old time/op new time/op delta
Equal/1 5.28ns ± 0% 4.19ns ± 9% -20.80%
Equal/2 5.30ns ± 0% 4.29ns ± 6% -19.06%
Equal/3 5.10ns ± 5% 4.20ns ± 6% -17.73%
Equal/4 5.05ns ± 0% 4.42ns ± 4% -12.50%
Equal/5 5.27ns ± 1% 4.44ns ± 4% -15.69%
Equal/6 5.30ns ± 0% 4.38ns ±12% -17.44%
Equal/7 5.02ns ± 6% 4.48ns ± 2% -10.64%
Equal/9 4.53ns ± 0% 4.34ns ± 7% -4.21%
Equal/16 4.52ns ± 0% 4.29ns ± 6% -5.16%
Change-Id: Ie124906e3a5012dfe634bfe09af06be42f1b178b
Reviewed-on: https://go-review.googlesource.com/c/go/+/473536
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
|
|
This code is generic to ppc64/ppc64le - there is no need to limit it to
aix or linux.
Updates #56001
Change-Id: I613964a90f9c5ca637720219a0260d65427f4be0
Reviewed-on: https://go-review.googlesource.com/c/go/+/473697
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
|
|
These directives affect the next declaration, so the existing form is
valid, but can be confusing because it is easy to miss. Move them
directly above the declaration for improved readability.
CL 69120 previously moved the Gosched nosplit away to hide it from
documentation. Since CL 224737, directives are automatically excluded
from documentation.
Change-Id: I8ebf2d47fbb5e77c6f40ed8afdf79eaa4f4e335e
Reviewed-on: https://go-review.googlesource.com/c/go/+/472957
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
|
|
Add bytealg.MakeNoZero that specially allocates a []byte
without zeroing it. It assumes the caller will populate every byte.
From within the bytes and strings packages, we can use
bytealg.MakeNoZero in a way where our logic ensures that
the entire slice is overwritten such that uninitialized bytes
are never leaked to the end user.
We use bytealg.MakeNoZero from within the following functions:
* bytes.Join
* bytes.Repeat
* bytes.ToUpper
* bytes.ToLower
* strings.Builder.Grow
The optimization in strings.Builder transitively benefits the following:
* strings.Join
* strings.Map
* strings.Repeat
* strings.ToUpper
* strings.ToLower
* strings.ToValidUTF8
* strings.Replace
* any user logic that depends on strings.Builder
This optimization is especially notable on large buffers that
do not fit in the CPU cache, such that the costs of
runtime.memclr and runtime.memmove are non-trivial since they are
both limited by the relatively slow speed of physical RAM.
Performance:
RepeatLarge/256/1 66.0ns ± 3% 64.5ns ± 1% ~ (p=0.095 n=5+5)
RepeatLarge/256/16 55.4ns ± 5% 53.1ns ± 3% -4.17% (p=0.016 n=5+5)
RepeatLarge/512/1 95.5ns ± 7% 87.1ns ± 2% -8.78% (p=0.008 n=5+5)
RepeatLarge/512/16 84.4ns ± 9% 76.2ns ± 5% -9.73% (p=0.016 n=5+5)
RepeatLarge/1024/1 161ns ± 4% 144ns ± 7% -10.45% (p=0.016 n=5+5)
RepeatLarge/1024/16 148ns ± 3% 141ns ± 5% ~ (p=0.095 n=5+5)
RepeatLarge/2048/1 296ns ± 7% 288ns ± 5% ~ (p=0.841 n=5+5)
RepeatLarge/2048/16 298ns ± 8% 281ns ± 5% ~ (p=0.151 n=5+5)
RepeatLarge/4096/1 593ns ± 8% 539ns ± 8% -8.99% (p=0.032 n=5+5)
RepeatLarge/4096/16 568ns ±12% 526ns ± 7% ~ (p=0.056 n=5+5)
RepeatLarge/8192/1 1.15µs ± 8% 1.08µs ±12% ~ (p=0.095 n=5+5)
RepeatLarge/8192/16 1.12µs ± 4% 1.07µs ± 7% ~ (p=0.310 n=5+5)
RepeatLarge/8192/4097 1.77ns ± 1% 1.76ns ± 2% ~ (p=0.310 n=5+5)
RepeatLarge/16384/1 2.06µs ± 7% 1.94µs ± 5% ~ (p=0.222 n=5+5)
RepeatLarge/16384/16 2.02µs ± 4% 1.92µs ± 6% ~ (p=0.095 n=5+5)
RepeatLarge/16384/4097 1.50µs ±15% 1.44µs ±11% ~ (p=0.802 n=5+5)
RepeatLarge/32768/1 3.90µs ± 8% 3.65µs ±11% ~ (p=0.151 n=5+5)
RepeatLarge/32768/16 3.92µs ±14% 3.68µs ±12% ~ (p=0.222 n=5+5)
RepeatLarge/32768/4097 3.71µs ± 5% 3.43µs ± 4% -7.54% (p=0.032 n=5+5)
RepeatLarge/65536/1 7.47µs ± 8% 6.88µs ± 9% ~ (p=0.056 n=5+5)
RepeatLarge/65536/16 7.29µs ± 4% 6.74µs ± 6% -7.60% (p=0.016 n=5+5)
RepeatLarge/65536/4097 7.90µs ±11% 6.34µs ± 5% -19.81% (p=0.008 n=5+5)
RepeatLarge/131072/1 17.0µs ±18% 14.1µs ± 6% -17.32% (p=0.008 n=5+5)
RepeatLarge/131072/16 15.2µs ± 2% 16.2µs ±17% ~ (p=0.151 n=5+5)
RepeatLarge/131072/4097 15.7µs ± 6% 14.8µs ±11% ~ (p=0.095 n=5+5)
RepeatLarge/262144/1 30.4µs ± 5% 31.4µs ±13% ~ (p=0.548 n=5+5)
RepeatLarge/262144/16 30.1µs ± 4% 30.7µs ±11% ~ (p=1.000 n=5+5)
RepeatLarge/262144/4097 31.2µs ± 7% 32.7µs ±13% ~ (p=0.310 n=5+5)
RepeatLarge/524288/1 67.5µs ± 9% 63.7µs ± 3% ~ (p=0.095 n=5+5)
RepeatLarge/524288/16 67.2µs ± 5% 62.9µs ± 6% ~ (p=0.151 n=5+5)
RepeatLarge/524288/4097 65.5µs ± 4% 65.2µs ±18% ~ (p=0.548 n=5+5)
RepeatLarge/1048576/1 141µs ± 6% 137µs ±14% ~ (p=0.421 n=5+5)
RepeatLarge/1048576/16 140µs ± 2% 134µs ±11% ~ (p=0.222 n=5+5)
RepeatLarge/1048576/4097 141µs ± 3% 134µs ±10% ~ (p=0.151 n=5+5)
RepeatLarge/2097152/1 258µs ± 2% 271µs ±10% ~ (p=0.222 n=5+5)
RepeatLarge/2097152/16 263µs ± 6% 273µs ± 9% ~ (p=0.151 n=5+5)
RepeatLarge/2097152/4097 270µs ± 2% 277µs ± 6% ~ (p=0.690 n=5+5)
RepeatLarge/4194304/1 684µs ± 3% 467µs ± 6% -31.69% (p=0.008 n=5+5)
RepeatLarge/4194304/16 682µs ± 1% 471µs ± 7% -30.91% (p=0.008 n=5+5)
RepeatLarge/4194304/4097 685µs ± 2% 465µs ±20% -32.12% (p=0.008 n=5+5)
RepeatLarge/8388608/1 1.50ms ± 1% 1.16ms ± 8% -22.63% (p=0.008 n=5+5)
RepeatLarge/8388608/16 1.50ms ± 2% 1.22ms ±17% -18.49% (p=0.008 n=5+5)
RepeatLarge/8388608/4097 1.51ms ± 7% 1.33ms ±11% -11.56% (p=0.008 n=5+5)
RepeatLarge/16777216/1 3.48ms ± 4% 2.66ms ±13% -23.76% (p=0.008 n=5+5)
RepeatLarge/16777216/16 3.37ms ± 3% 2.57ms ±13% -23.72% (p=0.008 n=5+5)
RepeatLarge/16777216/4097 3.38ms ± 9% 2.50ms ±11% -26.16% (p=0.008 n=5+5)
RepeatLarge/33554432/1 7.74ms ± 1% 4.70ms ±19% -39.31% (p=0.016 n=4+5)
RepeatLarge/33554432/16 7.90ms ± 4% 4.78ms ± 9% -39.50% (p=0.008 n=5+5)
RepeatLarge/33554432/4097 7.80ms ± 2% 4.86ms ±11% -37.60% (p=0.008 n=5+5)
RepeatLarge/67108864/1 16.4ms ± 3% 9.7ms ±15% -41.29% (p=0.008 n=5+5)
RepeatLarge/67108864/16 16.5ms ± 1% 9.9ms ±15% -39.83% (p=0.008 n=5+5)
RepeatLarge/67108864/4097 16.5ms ± 1% 11.0ms ±18% -32.95% (p=0.008 n=5+5)
RepeatLarge/134217728/1 35.2ms ±12% 19.2ms ±10% -45.58% (p=0.008 n=5+5)
RepeatLarge/134217728/16 34.6ms ± 6% 19.3ms ± 7% -44.07% (p=0.008 n=5+5)
RepeatLarge/134217728/4097 33.2ms ± 2% 19.3ms ±14% -41.79% (p=0.008 n=5+5)
RepeatLarge/268435456/1 70.9ms ± 2% 36.2ms ± 5% -48.87% (p=0.008 n=5+5)
RepeatLarge/268435456/16 77.4ms ± 7% 36.1ms ± 8% -53.33% (p=0.008 n=5+5)
RepeatLarge/268435456/4097 75.8ms ± 4% 37.0ms ± 4% -51.15% (p=0.008 n=5+5)
RepeatLarge/536870912/1 163ms ±14% 77ms ± 9% -52.94% (p=0.008 n=5+5)
RepeatLarge/536870912/16 156ms ± 4% 76ms ± 6% -51.42% (p=0.008 n=5+5)
RepeatLarge/536870912/4097 151ms ± 2% 76ms ± 6% -49.64% (p=0.008 n=5+5)
RepeatLarge/1073741824/1 293ms ± 5% 149ms ± 8% -49.18% (p=0.008 n=5+5)
RepeatLarge/1073741824/16 308ms ± 9% 150ms ± 8% -51.19% (p=0.008 n=5+5)
RepeatLarge/1073741824/4097 299ms ± 5% 151ms ± 6% -49.51% (p=0.008 n=5+5)
Updates #57153
Change-Id: I024553b7e676d6da6408278109ac1fa8def0a802
Reviewed-on: https://go-review.googlesource.com/c/go/+/456336
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Joseph Tsai <joetsai@digital-static.net>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Daniel Martí <mvdan@mvdan.cc>
|
|
Remove some unnecessary loops and pull the comparison code out from the
compare/loop code. Add an unaligned 8 byte comparison, which reads 8 bytes
from each input before comparing them. This gives a reasonable gain in
performance for the large unaligned case.
Updates #50615
name old time/op new time/op delta
CompareBytesEqual-4 116ns ± 0% 111ns ± 0% -4.10% (p=0.000 n=5+5)
CompareBytesToNil-4 34.9ns ± 0% 35.0ns ± 0% +0.45% (p=0.002 n=5+5)
CompareBytesEmpty-4 29.6ns ± 1% 29.8ns ± 0% +0.71% (p=0.016 n=5+5)
CompareBytesIdentical-4 29.8ns ± 0% 29.9ns ± 1% +0.50% (p=0.036 n=5+5)
CompareBytesSameLength-4 66.1ns ± 0% 60.4ns ± 0% -8.59% (p=0.000 n=5+5)
CompareBytesDifferentLength-4 63.1ns ± 0% 60.5ns ± 0% -4.20% (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=1-4 6.84ms ± 3% 6.04ms ± 5% -11.70% (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=2-4 6.99ms ± 4% 5.93ms ± 6% -15.22% (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=3-4 6.74ms ± 1% 6.00ms ± 5% -10.94% (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=4-4 7.20ms ± 6% 5.97ms ± 6% -17.05% (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=5-4 6.75ms ± 1% 5.81ms ± 8% -13.93% (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=6-4 6.89ms ± 5% 5.75ms ± 2% -16.58% (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4 6.91ms ± 6% 6.13ms ± 6% -11.27% (p=0.001 n=5+5)
CompareBytesBig-4 2.75ms ± 5% 2.71ms ± 8% ~ (p=0.651 n=5+5)
CompareBytesBigIdentical-4 29.9ns ± 1% 29.8ns ± 0% ~ (p=0.751 n=5+5)
name old speed new speed delta
CompareBytesBigUnaligned/offset=1-4 153MB/s ± 3% 174MB/s ± 6% +13.40% (p=0.003 n=5+5)
CompareBytesBigUnaligned/offset=2-4 150MB/s ± 4% 177MB/s ± 6% +18.06% (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=3-4 156MB/s ± 1% 175MB/s ± 5% +12.39% (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=4-4 146MB/s ± 6% 176MB/s ± 6% +20.67% (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=5-4 155MB/s ± 1% 181MB/s ± 7% +16.35% (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=6-4 152MB/s ± 5% 182MB/s ± 2% +19.74% (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4 152MB/s ± 6% 171MB/s ± 6% +12.70% (p=0.001 n=5+5)
CompareBytesBig-4 382MB/s ± 5% 388MB/s ± 9% ~ (p=0.616 n=5+5)
CompareBytesBigIdentical-4 35.1TB/s ± 1% 35.1TB/s ± 0% ~ (p=0.800 n=5+5)
Change-Id: I127edc376e62a2c529719a4ab172f481e0a81357
Reviewed-on: https://go-review.googlesource.com/c/go/+/431100
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joedian Reid <joedian@golang.org>
Run-TryBot: Joel Sing <joel@sing.id.au>
|