From 6b66b59328f3178cceae624a6100cfe8e7944746 Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Tue, 13 Oct 2020 08:32:41 +0200 Subject: internal/cpu: remove unused arm64 capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: I038b0fe165931b8ec3ef59f08dc73c8128d56572 Reviewed-on: https://go-review.googlesource.com/c/go/+/261365 Trust: Martin Möhrmann Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Keith Randall --- src/internal/cpu/cpu_arm64.go | 68 ++++--------------------------------------- 1 file changed, 6 insertions(+), 62 deletions(-) (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index efdb3b9e33..d9e0c98ca6 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -14,87 +14,31 @@ var HWCap2 uint // HWCAP/HWCAP2 bits. These are exposed by Linux. const ( - hwcap_FP = 1 << 0 - hwcap_ASIMD = 1 << 1 - hwcap_EVTSTRM = 1 << 2 - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_FPHP = 1 << 9 - hwcap_ASIMDHP = 1 << 10 - hwcap_CPUID = 1 << 11 - hwcap_ASIMDRDM = 1 << 12 - hwcap_JSCVT = 1 << 13 - hwcap_FCMA = 1 << 14 - hwcap_LRCPC = 1 << 15 - hwcap_DCPOP = 1 << 16 - hwcap_SHA3 = 1 << 17 - hwcap_SM3 = 1 << 18 - hwcap_SM4 = 1 << 19 - hwcap_ASIMDDP = 1 << 20 - hwcap_SHA512 = 1 << 21 - hwcap_SVE = 1 << 22 - hwcap_ASIMDFHM = 1 << 23 + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 ) func doinit() { options = []option{ - {Name: "evtstrm", Feature: &ARM64.HasEVTSTRM}, {Name: "aes", Feature: &ARM64.HasAES}, {Name: "pmull", Feature: &ARM64.HasPMULL}, {Name: "sha1", Feature: &ARM64.HasSHA1}, {Name: "sha2", Feature: &ARM64.HasSHA2}, {Name: "crc32", Feature: &ARM64.HasCRC32}, {Name: "atomics", Feature: &ARM64.HasATOMICS}, - {Name: "fphp", Feature: &ARM64.HasFPHP}, - {Name: "asimdhp", Feature: &ARM64.HasASIMDHP}, - {Name: "cpuid", Feature: &ARM64.HasCPUID}, - {Name: "asimdrdm", Feature: &ARM64.HasASIMDRDM}, - {Name: "jscvt", Feature: &ARM64.HasJSCVT}, - {Name: "fcma", Feature: &ARM64.HasFCMA}, - {Name: "lrcpc", Feature: &ARM64.HasLRCPC}, - {Name: "dcpop", Feature: &ARM64.HasDCPOP}, - {Name: "sha3", Feature: &ARM64.HasSHA3}, - {Name: "sm3", Feature: &ARM64.HasSM3}, - {Name: "sm4", Feature: &ARM64.HasSM4}, - {Name: "asimddp", Feature: &ARM64.HasASIMDDP}, - {Name: "sha512", Feature: &ARM64.HasSHA512}, - {Name: "sve", Feature: &ARM64.HasSVE}, - {Name: "asimdfhm", Feature: &ARM64.HasASIMDFHM}, - - // These capabilities should always be enabled on arm64: - {Name: "fp", Feature: &ARM64.HasFP, Required: true}, - {Name: "asimd", Feature: &ARM64.HasASIMD, Required: true}, } // HWCAP feature bits - ARM64.HasFP = isSet(HWCap, hwcap_FP) - ARM64.HasASIMD = isSet(HWCap, hwcap_ASIMD) - ARM64.HasEVTSTRM = isSet(HWCap, hwcap_EVTSTRM) ARM64.HasAES = isSet(HWCap, hwcap_AES) ARM64.HasPMULL = isSet(HWCap, hwcap_PMULL) ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) - ARM64.HasFPHP = isSet(HWCap, hwcap_FPHP) - ARM64.HasASIMDHP = isSet(HWCap, hwcap_ASIMDHP) - ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID) - ARM64.HasASIMDRDM = isSet(HWCap, hwcap_ASIMDRDM) - ARM64.HasJSCVT = isSet(HWCap, hwcap_JSCVT) - ARM64.HasFCMA = isSet(HWCap, hwcap_FCMA) - ARM64.HasLRCPC = isSet(HWCap, hwcap_LRCPC) - ARM64.HasDCPOP = isSet(HWCap, hwcap_DCPOP) - ARM64.HasSHA3 = isSet(HWCap, hwcap_SHA3) - ARM64.HasSM3 = isSet(HWCap, hwcap_SM3) - ARM64.HasSM4 = isSet(HWCap, hwcap_SM4) - ARM64.HasASIMDDP = isSet(HWCap, hwcap_ASIMDDP) - ARM64.HasSHA512 = isSet(HWCap, hwcap_SHA512) - ARM64.HasSVE = isSet(HWCap, hwcap_SVE) - ARM64.HasASIMDFHM = isSet(HWCap, hwcap_ASIMDFHM) } func isSet(hwc uint, value uint) bool { -- cgit v1.3 From de932da453f68b8fc04e9c2ab25136748173c806 Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Tue, 13 Oct 2020 22:30:23 +0200 Subject: internal/cpu: consolidate arm64 feature detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move code to detect and mask arm64 CPU features from runtime to internal/cpu. Change-Id: Ib784e2ff056e8def125d68827b852f07a3eff0db Reviewed-on: https://go-review.googlesource.com/c/go/+/261878 Trust: Martin Möhrmann Trust: Tobias Klauser Run-TryBot: Tobias Klauser TryBot-Result: Go Bot Reviewed-by: Tobias Klauser Reviewed-by: Benny Siegert --- src/internal/cpu/cpu_android.go | 7 ++ src/internal/cpu/cpu_arm64.go | 73 ++++++++++++++++---- src/internal/cpu/cpu_arm64.s | 12 ++++ src/internal/cpu/cpu_freebsd.go | 7 ++ src/internal/cpu/cpu_linux.go | 9 +++ src/internal/cpu/cpu_other.go | 11 +++ src/runtime/auxv_none.go | 1 - src/runtime/os_freebsd_arm64.go | 143 --------------------------------------- src/runtime/os_freebsd_noauxv.go | 2 +- src/runtime/os_linux_arm64.go | 14 +--- src/runtime/os_netbsd.go | 1 - src/runtime/os_netbsd_386.go | 3 - src/runtime/os_netbsd_amd64.go | 3 - src/runtime/os_netbsd_arm.go | 3 - src/runtime/os_netbsd_arm64.go | 12 +--- src/runtime/os_openbsd_arm64.go | 11 --- src/runtime/sys_freebsd_arm64.s | 21 ------ 17 files changed, 110 insertions(+), 223 deletions(-) create mode 100644 src/internal/cpu/cpu_android.go create mode 100644 src/internal/cpu/cpu_arm64.s create mode 100644 src/internal/cpu/cpu_freebsd.go create mode 100644 src/internal/cpu/cpu_linux.go create mode 100644 src/internal/cpu/cpu_other.go (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu_android.go b/src/internal/cpu/cpu_android.go new file mode 100644 index 0000000000..d995e8d5a7 --- /dev/null +++ b/src/internal/cpu/cpu_android.go @@ -0,0 +1,7 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const GOOS = "android" diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index d9e0c98ca6..533bea2470 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -6,13 +6,11 @@ package cpu const CacheLinePadSize = 64 -// arm64 doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2. -// These are initialized by archauxv and should not be changed after they are -// initialized. +// HWCap may be initialized by archauxv and +// should not be changed after it was initialized. var HWCap uint -var HWCap2 uint -// HWCAP/HWCAP2 bits. These are exposed by Linux. +// HWCAP bits. These are exposed by Linux. const ( hwcap_AES = 1 << 3 hwcap_PMULL = 1 << 4 @@ -32,15 +30,66 @@ func doinit() { {Name: "atomics", Feature: &ARM64.HasATOMICS}, } - // HWCAP feature bits - ARM64.HasAES = isSet(HWCap, hwcap_AES) - ARM64.HasPMULL = isSet(HWCap, hwcap_PMULL) - ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) - ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) - ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) - ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) + switch GOOS { + case "linux", "android": + // HWCap was populated by the runtime from the auxillary vector. + // Use HWCap information since reading aarch64 system registers + // is not supported in user space on older linux kernels. + ARM64.HasAES = isSet(HWCap, hwcap_AES) + ARM64.HasPMULL = isSet(HWCap, hwcap_PMULL) + ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) + ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) + ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) + + // The Samsung S9+ kernel reports support for atomics, but not all cores + // actually support them, resulting in SIGILL. See issue #28431. + // TODO(elias.naur): Only disable the optimization on bad chipsets on android. + ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android" + + case "freebsd": + // Retrieve info from system register ID_AA64ISAR0_EL1. + isar0 := getisar0() + + // ID_AA64ISAR0_EL1 + switch extractBits(isar0, 4, 7) { + case 1: + ARM64.HasAES = true + case 2: + ARM64.HasAES = true + ARM64.HasPMULL = true + } + + switch extractBits(isar0, 8, 11) { + case 1: + ARM64.HasSHA1 = true + } + + switch extractBits(isar0, 12, 15) { + case 1, 2: + ARM64.HasSHA2 = true + } + + switch extractBits(isar0, 16, 19) { + case 1: + ARM64.HasCRC32 = true + } + + switch extractBits(isar0, 20, 23) { + case 2: + ARM64.HasATOMICS = true + } + default: + // Other operating systems do not support reading HWCap from auxillary vector + // or reading privileged aarch64 system registers in user space. + } +} + +func extractBits(data uint64, start, end uint) uint { + return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) } func isSet(hwc uint, value uint) bool { return hwc&value != 0 } + +func getisar0() uint64 diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s new file mode 100644 index 0000000000..d85914973f --- /dev/null +++ b/src/internal/cpu/cpu_arm64.s @@ -0,0 +1,12 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// func getisar0() uint64 +TEXT ·getisar0(SB),NOSPLIT,$0 + // get Instruction Set Attributes 0 into R0 + MRS ID_AA64ISAR0_EL1, R0 + MOVD R0, ret+0(FP) + RET diff --git a/src/internal/cpu/cpu_freebsd.go b/src/internal/cpu/cpu_freebsd.go new file mode 100644 index 0000000000..dc37173dac --- /dev/null +++ b/src/internal/cpu/cpu_freebsd.go @@ -0,0 +1,7 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const GOOS = "freebsd" diff --git a/src/internal/cpu/cpu_linux.go b/src/internal/cpu/cpu_linux.go new file mode 100644 index 0000000000..ec0b84c510 --- /dev/null +++ b/src/internal/cpu/cpu_linux.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !android + +package cpu + +const GOOS = "linux" diff --git a/src/internal/cpu/cpu_other.go b/src/internal/cpu/cpu_other.go new file mode 100644 index 0000000000..8a15fbe79d --- /dev/null +++ b/src/internal/cpu/cpu_other.go @@ -0,0 +1,11 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !linux +// +build !freebsd +// +build !android + +package cpu + +const GOOS = "other" diff --git a/src/runtime/auxv_none.go b/src/runtime/auxv_none.go index 3a560a1793..3ca617b21e 100644 --- a/src/runtime/auxv_none.go +++ b/src/runtime/auxv_none.go @@ -7,7 +7,6 @@ // +build !dragonfly // +build !freebsd // +build !netbsd -// +build !openbsd !arm64 // +build !solaris package runtime diff --git a/src/runtime/os_freebsd_arm64.go b/src/runtime/os_freebsd_arm64.go index 51ebf9d478..b5b25f0dc5 100644 --- a/src/runtime/os_freebsd_arm64.go +++ b/src/runtime/os_freebsd_arm64.go @@ -4,149 +4,6 @@ package runtime -import "internal/cpu" - -const ( - hwcap_FP = 1 << 0 - hwcap_ASIMD = 1 << 1 - hwcap_EVTSTRM = 1 << 2 - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_FPHP = 1 << 9 - hwcap_ASIMDHP = 1 << 10 - hwcap_CPUID = 1 << 11 - hwcap_ASIMDRDM = 1 << 12 - hwcap_JSCVT = 1 << 13 - hwcap_FCMA = 1 << 14 - hwcap_LRCPC = 1 << 15 - hwcap_DCPOP = 1 << 16 - hwcap_SHA3 = 1 << 17 - hwcap_SM3 = 1 << 18 - hwcap_SM4 = 1 << 19 - hwcap_ASIMDDP = 1 << 20 - hwcap_SHA512 = 1 << 21 - hwcap_SVE = 1 << 22 - hwcap_ASIMDFHM = 1 << 23 -) - -func getisar0() uint64 -func getisar1() uint64 -func getpfr0() uint64 - -// no hwcap support on FreeBSD aarch64, we need to retrieve the info from -// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1 -func archauxv(tag, val uintptr) { - var isar0, isar1, pfr0 uint64 - - isar0 = getisar0() - isar1 = getisar1() - pfr0 = getpfr0() - - // ID_AA64ISAR0_EL1 - switch extractBits(isar0, 4, 7) { - case 1: - cpu.HWCap |= hwcap_AES - case 2: - cpu.HWCap |= hwcap_PMULL | hwcap_AES - } - - switch extractBits(isar0, 8, 11) { - case 1: - cpu.HWCap |= hwcap_SHA1 - } - - switch extractBits(isar0, 12, 15) { - case 1: - cpu.HWCap |= hwcap_SHA2 - case 2: - cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512 - } - - switch extractBits(isar0, 16, 19) { - case 1: - cpu.HWCap |= hwcap_CRC32 - } - - switch extractBits(isar0, 20, 23) { - case 2: - cpu.HWCap |= hwcap_ATOMICS - } - - switch extractBits(isar0, 28, 31) { - case 1: - cpu.HWCap |= hwcap_ASIMDRDM - } - - switch extractBits(isar0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SHA3 - } - - switch extractBits(isar0, 36, 39) { - case 1: - cpu.HWCap |= hwcap_SM3 - } - - switch extractBits(isar0, 40, 43) { - case 1: - cpu.HWCap |= hwcap_SM4 - } - - switch extractBits(isar0, 44, 47) { - case 1: - cpu.HWCap |= hwcap_ASIMDDP - } - - // ID_AA64ISAR1_EL1 - switch extractBits(isar1, 0, 3) { - case 1: - cpu.HWCap |= hwcap_DCPOP - } - - switch extractBits(isar1, 12, 15) { - case 1: - cpu.HWCap |= hwcap_JSCVT - } - - switch extractBits(isar1, 16, 19) { - case 1: - cpu.HWCap |= hwcap_FCMA - } - - switch extractBits(isar1, 20, 23) { - case 1: - cpu.HWCap |= hwcap_LRCPC - } - - // ID_AA64PFR0_EL1 - switch extractBits(pfr0, 16, 19) { - case 0: - cpu.HWCap |= hwcap_FP - case 1: - cpu.HWCap |= hwcap_FP | hwcap_FPHP - } - - switch extractBits(pfr0, 20, 23) { - case 0: - cpu.HWCap |= hwcap_ASIMD - case 1: - cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP - } - - switch extractBits(pfr0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SVE - } -} - -func extractBits(data uint64, start, end uint) uint { - return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) -} - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed fastrand(). diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go index c6a49927c8..01efb9b7c9 100644 --- a/src/runtime/os_freebsd_noauxv.go +++ b/src/runtime/os_freebsd_noauxv.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // +build freebsd -// +build !arm,!arm64 +// +build !arm package runtime diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go index 19968dc164..c5fd742048 100644 --- a/src/runtime/os_linux_arm64.go +++ b/src/runtime/os_linux_arm64.go @@ -11,19 +11,7 @@ import "internal/cpu" func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - // arm64 doesn't have a 'cpuid' instruction equivalent and relies on - // HWCAP/HWCAP2 bits for hardware capabilities. - hwcap := uint(val) - if GOOS == "android" { - // The Samsung S9+ kernel reports support for atomics, but not all cores - // actually support them, resulting in SIGILL. See issue #28431. - // TODO(elias.naur): Only disable the optimization on bad chipsets. - const hwcap_ATOMICS = 1 << 8 - hwcap &= ^uint(hwcap_ATOMICS) - } - cpu.HWCap = hwcap - case _AT_HWCAP2: - cpu.HWCap2 = uint(val) + cpu.HWCap = uint(val) } } diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go index c4c3d8e2fe..f7f90cedc1 100644 --- a/src/runtime/os_netbsd.go +++ b/src/runtime/os_netbsd.go @@ -359,7 +359,6 @@ func sysargs(argc int32, argv **byte) { // now argv+n is auxv auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) sysauxv(auxv[:]) - archauxv(auxv[:]) } const ( diff --git a/src/runtime/os_netbsd_386.go b/src/runtime/os_netbsd_386.go index c203af9cef..037f7e36dc 100644 --- a/src/runtime/os_netbsd_386.go +++ b/src/runtime/os_netbsd_386.go @@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp mc.__gregs[_REG_EDX] = uint32(uintptr(unsafe.Pointer(gp))) mc.__gregs[_REG_ESI] = uint32(fn) } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_amd64.go b/src/runtime/os_netbsd_amd64.go index ea9d125492..5118b0c4ff 100644 --- a/src/runtime/os_netbsd_amd64.go +++ b/src/runtime/os_netbsd_amd64.go @@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp mc.__gregs[_REG_R9] = uint64(uintptr(unsafe.Pointer(gp))) mc.__gregs[_REG_R12] = uint64(fn) } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go index 646da9dc0b..b5ec23e45b 100644 --- a/src/runtime/os_netbsd_arm.go +++ b/src/runtime/os_netbsd_arm.go @@ -32,6 +32,3 @@ func cputicks() int64 { // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func archauxv(auxv []uintptr) { -} diff --git a/src/runtime/os_netbsd_arm64.go b/src/runtime/os_netbsd_arm64.go index ae2638c778..8d21b0a430 100644 --- a/src/runtime/os_netbsd_arm64.go +++ b/src/runtime/os_netbsd_arm64.go @@ -4,10 +4,7 @@ package runtime -import ( - "internal/cpu" - "unsafe" -) +import "unsafe" func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) { // Machine dependent mcontext initialisation for LWP. @@ -24,10 +21,3 @@ func cputicks() int64 { // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func archauxv(auxv []uintptr) { - // NetBSD does not supply AT_HWCAP, however we still need to initialise cpu.HWCaps. - // For now specify the bare minimum until we add some form of capabilities - // detection. See issue https://golang.org/issue/30824#issuecomment-494901591 - cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP -} diff --git a/src/runtime/os_openbsd_arm64.go b/src/runtime/os_openbsd_arm64.go index d559a2a3e5..d71de7d196 100644 --- a/src/runtime/os_openbsd_arm64.go +++ b/src/runtime/os_openbsd_arm64.go @@ -4,20 +4,9 @@ package runtime -import ( - "internal/cpu" -) - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func sysargs(argc int32, argv **byte) { - // OpenBSD does not have auxv, however we still need to initialise cpu.HWCaps. - // For now specify the bare minimum until we add some form of capabilities - // detection. See issue #31746. - cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP -} diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s index 2330f2ffe2..8a4f9b7fa1 100644 --- a/src/runtime/sys_freebsd_arm64.s +++ b/src/runtime/sys_freebsd_arm64.s @@ -515,24 +515,3 @@ TEXT runtime·getCntxct(SB),NOSPLIT,$0 MOVW R0, ret+8(FP) RET - -// func getisar0() uint64 -TEXT runtime·getisar0(SB),NOSPLIT,$0 - // get Instruction Set Attributes 0 into R0 - MRS ID_AA64ISAR0_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getisar1() uint64 -TEXT runtime·getisar1(SB),NOSPLIT,$0 - // get Instruction Set Attributes 1 into R0 - MRS ID_AA64ISAR1_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getpfr0() uint64 -TEXT runtime·getpfr0(SB),NOSPLIT,$0 - // get Processor Feature Register 0 into R0 - MRS ID_AA64PFR0_EL1, R0 - MOVD R0, ret+0(FP) - RET -- cgit v1.3 From d5388e23b5c75bf8189b173051d24a0176a5a303 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Fri, 30 Oct 2020 18:46:23 +0000 Subject: runtime: improve memmove performance on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the memmove implementation for moves of 17 bytes or larger with an implementation from ARM optimized software. The moves of 16 bytes or fewer are unchanged, but the registers used are updated to match the rest of the implementation. This implementation makes use of new optimizations: - software pipelined loop for large (>128 byte) moves - medium size moves (17..128 bytes) have a new implementation - address realignment when src or dst is unaligned - preference for aligned src (loads) or dst (stores) depending on CPU To support preference for aligned loads or aligned stores, a new CPU flag is added. This flag indicates that the detected micro architecture performs better with aligned loads. Some tested CPUs did not exhibit a significant difference and are left with the default behavior of realigning based on the destination address (stores). Neoverse N1 (Tested on Graviton 2) name old time/op new time/op delta Memmove/0-4 1.88ns ± 1% 1.87ns ± 1% -0.58% (p=0.020 n=10+10) Memmove/1-4 4.40ns ± 0% 4.40ns ± 0% ~ (all equal) Memmove/8-4 3.88ns ± 3% 3.80ns ± 0% -1.97% (p=0.001 n=10+9) Memmove/16-4 3.90ns ± 3% 3.80ns ± 0% -2.49% (p=0.000 n=10+9) Memmove/32-4 4.80ns ± 0% 4.40ns ± 0% -8.33% (p=0.000 n=9+8) Memmove/64-4 5.86ns ± 0% 5.00ns ± 0% -14.76% (p=0.000 n=8+8) Memmove/128-4 8.46ns ± 0% 8.06ns ± 0% -4.62% (p=0.000 n=10+10) Memmove/256-4 12.4ns ± 0% 12.2ns ± 0% -1.61% (p=0.000 n=10+10) Memmove/512-4 19.5ns ± 0% 19.1ns ± 0% -2.05% (p=0.000 n=10+10) Memmove/1024-4 33.7ns ± 0% 33.5ns ± 0% -0.59% (p=0.000 n=10+10) Memmove/2048-4 62.1ns ± 0% 59.0ns ± 0% -4.99% (p=0.000 n=10+10) Memmove/4096-4 117ns ± 1% 110ns ± 0% -5.66% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 6.41ns ± 0% 5.62ns ± 0% -12.32% (p=0.000 n=10+7) MemmoveUnalignedDst/128-4 9.40ns ± 0% 8.34ns ± 0% -11.24% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 12.8ns ± 0% 12.8ns ± 0% ~ (all equal) MemmoveUnalignedDst/512-4 20.4ns ± 0% 19.7ns ± 0% -3.43% (p=0.000 n=9+10) MemmoveUnalignedDst/1024-4 34.1ns ± 0% 35.1ns ± 0% +2.93% (p=0.000 n=9+9) MemmoveUnalignedDst/2048-4 61.5ns ± 0% 60.4ns ± 0% -1.77% (p=0.000 n=10+10) MemmoveUnalignedDst/4096-4 122ns ± 0% 113ns ± 0% -7.38% (p=0.002 n=8+10) MemmoveUnalignedSrc/64-4 7.25ns ± 1% 6.26ns ± 0% -13.64% (p=0.000 n=9+9) MemmoveUnalignedSrc/128-4 10.5ns ± 0% 9.7ns ± 0% -7.52% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 17.1ns ± 0% 17.3ns ± 0% +1.17% (p=0.000 n=10+10) MemmoveUnalignedSrc/512-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal) MemmoveUnalignedSrc/1024-4 46.7ns ± 0% 35.7ns ± 0% -23.55% (p=0.000 n=10+9) MemmoveUnalignedSrc/2048-4 85.2ns ± 0% 61.2ns ± 0% -28.17% (p=0.000 n=10+8) MemmoveUnalignedSrc/4096-4 162ns ± 0% 113ns ± 0% -30.25% (p=0.000 n=10+10) name old speed new speed delta Memmove/4096-4 35.2GB/s ± 0% 37.1GB/s ± 0% +5.56% (p=0.000 n=10+9) MemmoveUnalignedSrc/1024-4 21.9GB/s ± 0% 28.7GB/s ± 0% +30.90% (p=0.000 n=10+10) MemmoveUnalignedSrc/2048-4 24.0GB/s ± 0% 33.5GB/s ± 0% +39.18% (p=0.000 n=10+9) MemmoveUnalignedSrc/4096-4 25.3GB/s ± 0% 36.2GB/s ± 0% +43.50% (p=0.000 n=10+7) Cortex-A72 (Graviton 1) name old time/op new time/op delta Memmove/0-4 3.06ns ± 3% 3.08ns ± 1% ~ (p=0.958 n=10+9) Memmove/1-4 8.72ns ± 0% 7.85ns ± 0% -9.98% (p=0.002 n=8+10) Memmove/8-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/16-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/32-4 8.19ns ± 2% 8.29ns ± 0% ~ (p=0.114 n=10+10) Memmove/64-4 18.3ns ± 4% 10.0ns ± 0% -45.36% (p=0.000 n=10+10) Memmove/128-4 14.8ns ± 0% 17.4ns ± 0% +17.77% (p=0.000 n=10+10) Memmove/256-4 21.8ns ± 0% 23.1ns ± 0% +5.96% (p=0.000 n=10+10) Memmove/512-4 35.8ns ± 0% 37.2ns ± 0% +3.91% (p=0.000 n=10+10) Memmove/1024-4 63.7ns ± 0% 67.2ns ± 0% +5.49% (p=0.000 n=10+10) Memmove/2048-4 126ns ± 0% 123ns ± 0% -2.38% (p=0.000 n=10+10) Memmove/4096-4 238ns ± 1% 243ns ± 1% +1.93% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 19.3ns ± 1% 12.0ns ± 1% -37.49% (p=0.000 n=10+10) MemmoveUnalignedDst/128-4 17.2ns ± 0% 17.4ns ± 0% +1.16% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 28.2ns ± 8% 29.2ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedDst/512-4 49.8ns ± 3% 48.9ns ± 0% ~ (p=1.000 n=10+10) MemmoveUnalignedDst/1024-4 89.5ns ± 0% 80.5ns ± 1% -10.02% (p=0.000 n=10+10) MemmoveUnalignedDst/2048-4 180ns ± 0% 127ns ± 0% -29.44% (p=0.000 n=9+10) MemmoveUnalignedDst/4096-4 347ns ± 0% 244ns ± 0% -29.59% (p=0.000 n=10+9) MemmoveUnalignedSrc/128-4 16.1ns ± 0% 21.8ns ± 0% +35.40% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 24.9ns ± 8% 26.6ns ± 0% +6.70% (p=0.015 n=10+10) MemmoveUnalignedSrc/512-4 39.4ns ± 6% 40.6ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedSrc/1024-4 72.5ns ± 0% 83.0ns ± 1% +14.44% (p=0.000 n=9+10) MemmoveUnalignedSrc/2048-4 129ns ± 1% 128ns ± 1% ~ (p=0.179 n=10+10) MemmoveUnalignedSrc/4096-4 241ns ± 0% 253ns ± 1% +4.99% (p=0.000 n=9+9) Cortex-A53 (Raspberry Pi 3) name old time/op new time/op delta Memmove/0-4 11.0ns ± 0% 11.0ns ± 1% ~ (p=0.294 n=8+10) Memmove/1-4 29.6ns ± 0% 28.0ns ± 1% -5.41% (p=0.000 n=9+10) Memmove/8-4 23.5ns ± 0% 22.1ns ± 0% -6.11% (p=0.000 n=8+8) Memmove/16-4 23.7ns ± 1% 22.1ns ± 0% -6.59% (p=0.000 n=10+8) Memmove/32-4 27.9ns ± 0% 27.1ns ± 0% -3.13% (p=0.000 n=8+8) Memmove/64-4 33.8ns ± 0% 31.5ns ± 1% -6.99% (p=0.000 n=8+10) Memmove/128-4 45.6ns ± 0% 44.2ns ± 1% -3.23% (p=0.000 n=9+10) Memmove/256-4 69.3ns ± 0% 69.3ns ± 0% ~ (p=0.072 n=8+8) Memmove/512-4 127ns ± 0% 110ns ± 0% -13.39% (p=0.000 n=8+8) Memmove/1024-4 222ns ± 0% 205ns ± 1% -7.66% (p=0.000 n=7+10) Memmove/2048-4 411ns ± 0% 366ns ± 0% -10.98% (p=0.000 n=8+9) Memmove/4096-4 795ns ± 1% 695ns ± 1% -12.63% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 44.0ns ± 0% 40.5ns ± 0% -7.93% (p=0.000 n=8+8) MemmoveUnalignedDst/128-4 59.6ns ± 0% 54.9ns ± 0% -7.85% (p=0.000 n=9+9) MemmoveUnalignedDst/256-4 98.2ns ±11% 90.0ns ± 1% ~ (p=0.130 n=10+10) MemmoveUnalignedDst/512-4 161ns ± 2% 145ns ± 1% -9.96% (p=0.000 n=10+10) MemmoveUnalignedDst/1024-4 281ns ± 0% 265ns ± 0% -5.65% (p=0.000 n=9+8) MemmoveUnalignedDst/2048-4 528ns ± 0% 482ns ± 0% -8.73% (p=0.000 n=8+9) MemmoveUnalignedDst/4096-4 1.02µs ± 1% 0.92µs ± 0% -10.00% (p=0.000 n=10+8) MemmoveUnalignedSrc/64-4 42.4ns ± 1% 40.5ns ± 0% -4.39% (p=0.000 n=10+8) MemmoveUnalignedSrc/128-4 57.4ns ± 0% 57.0ns ± 1% -0.75% (p=0.048 n=9+10) MemmoveUnalignedSrc/256-4 88.1ns ± 1% 89.6ns ± 0% +1.70% (p=0.000 n=9+8) MemmoveUnalignedSrc/512-4 160ns ± 2% 144ns ± 0% -9.89% (p=0.000 n=10+8) MemmoveUnalignedSrc/1024-4 286ns ± 0% 266ns ± 1% -6.69% (p=0.000 n=8+10) MemmoveUnalignedSrc/2048-4 525ns ± 0% 483ns ± 1% -7.96% (p=0.000 n=9+10) MemmoveUnalignedSrc/4096-4 1.01µs ± 0% 0.92µs ± 1% -9.40% (p=0.000 n=8+10) Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab Reviewed-on: https://go-review.googlesource.com/c/go/+/243357 TryBot-Result: Go Bot Reviewed-by: Martin Möhrmann Reviewed-by: eric fang Reviewed-by: Cherry Zhang --- src/internal/cpu/cpu.go | 54 +++---- src/internal/cpu/cpu_arm64.go | 25 ++++ src/internal/cpu/cpu_arm64.s | 6 + src/runtime/cpuflags_arm64.go | 17 +++ src/runtime/memmove_arm64.s | 332 ++++++++++++++++++++++++++---------------- src/runtime/memmove_test.go | 30 ++++ 6 files changed, 314 insertions(+), 150 deletions(-) create mode 100644 src/runtime/cpuflags_arm64.go (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 2829945af0..0ceedcd7d2 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -56,32 +56,34 @@ var ARM struct { // The booleans in ARM64 contain the correspondingly named cpu feature bit. // The struct is padded to avoid false sharing. var ARM64 struct { - _ CacheLinePad - HasFP bool - HasASIMD bool - HasEVTSTRM bool - HasAES bool - HasPMULL bool - HasSHA1 bool - HasSHA2 bool - HasCRC32 bool - HasATOMICS bool - HasFPHP bool - HasASIMDHP bool - HasCPUID bool - HasASIMDRDM bool - HasJSCVT bool - HasFCMA bool - HasLRCPC bool - HasDCPOP bool - HasSHA3 bool - HasSM3 bool - HasSM4 bool - HasASIMDDP bool - HasSHA512 bool - HasSVE bool - HasASIMDFHM bool - _ CacheLinePad + _ CacheLinePad + HasFP bool + HasASIMD bool + HasEVTSTRM bool + HasAES bool + HasPMULL bool + HasSHA1 bool + HasSHA2 bool + HasCRC32 bool + HasATOMICS bool + HasFPHP bool + HasASIMDHP bool + HasCPUID bool + HasASIMDRDM bool + HasJSCVT bool + HasFCMA bool + HasLRCPC bool + HasDCPOP bool + HasSHA3 bool + HasSM3 bool + HasSM4 bool + HasASIMDDP bool + HasSHA512 bool + HasSVE bool + HasASIMDFHM bool + IsNeoverseN1 bool + IsZeus bool + _ CacheLinePad } var MIPS64X struct { diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index 533bea2470..8fde39f03e 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -18,6 +18,7 @@ const ( hwcap_SHA2 = 1 << 6 hwcap_CRC32 = 1 << 7 hwcap_ATOMICS = 1 << 8 + hwcap_CPUID = 1 << 11 ) func doinit() { @@ -28,6 +29,8 @@ func doinit() { {Name: "sha2", Feature: &ARM64.HasSHA2}, {Name: "crc32", Feature: &ARM64.HasCRC32}, {Name: "atomics", Feature: &ARM64.HasATOMICS}, + {Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1}, + {Name: "isZeus", Feature: &ARM64.IsZeus}, } switch GOOS { @@ -40,12 +43,32 @@ func doinit() { ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) + ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID) // The Samsung S9+ kernel reports support for atomics, but not all cores // actually support them, resulting in SIGILL. See issue #28431. // TODO(elias.naur): Only disable the optimization on bad chipsets on android. ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android" + // Check to see if executing on a NeoverseN1 and in order to do that, + // check the AUXV for the CPUID bit. The getMIDR function executes an + // instruction which would normally be an illegal instruction, but it's + // trapped by the kernel, the value sanitized and then returned. Without + // the CPUID bit the kernel will not trap the instruction and the process + // will be terminated with SIGILL. + if ARM64.HasCPUID { + midr := getMIDR() + part_num := uint16((midr >> 4) & 0xfff) + implementor := byte((midr >> 24) & 0xff) + + if implementor == 'A' && part_num == 0xd0c { + ARM64.IsNeoverseN1 = true + } + if implementor == 'A' && part_num == 0xd40 { + ARM64.IsZeus = true + } + } + case "freebsd": // Retrieve info from system register ID_AA64ISAR0_EL1. isar0 := getisar0() @@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool { } func getisar0() uint64 + +func getMIDR() uint64 diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s index d85914973f..d6e7f44373 100644 --- a/src/internal/cpu/cpu_arm64.s +++ b/src/internal/cpu/cpu_arm64.s @@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0 MRS ID_AA64ISAR0_EL1, R0 MOVD R0, ret+0(FP) RET + +// func getMIDR() uint64 +TEXT ·getMIDR(SB), NOSPLIT, $0-8 + MRS MIDR_EL1, R0 + MOVD R0, ret+0(FP) + RET diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go new file mode 100644 index 0000000000..7576bef4a7 --- /dev/null +++ b/src/runtime/cpuflags_arm64.go @@ -0,0 +1,17 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" +) + +var arm64UseAlignedLoads bool + +func init() { + if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus { + arm64UseAlignedLoads = true + } +} diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s index dbb7e9a28a..43d27629e5 100644 --- a/src/runtime/memmove_arm64.s +++ b/src/runtime/memmove_arm64.s @@ -6,152 +6,236 @@ // See memmove Go doc for important implementation constraints. +// Register map +// +// dstin R0 +// src R1 +// count R2 +// dst R3 (same as R0, but gets modified in unaligned cases) +// srcend R4 +// dstend R5 +// data R6-R17 +// tmp1 R14 + +// Copies are split into 3 main cases: small copies of up to 32 bytes, medium +// copies of up to 128 bytes, and large copies. The overhead of the overlap +// check is negligible since it is only required for large copies. +// +// Large copies use a software pipelined loop processing 64 bytes per iteration. +// The destination pointer is 16-byte aligned to minimize unaligned accesses. +// The loop tail is handled by always copying 64 bytes from the end. + // func memmove(to, from unsafe.Pointer, n uintptr) TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 - MOVD to+0(FP), R3 - MOVD from+8(FP), R4 - MOVD n+16(FP), R5 - CBNZ R5, check - RET + MOVD to+0(FP), R0 + MOVD from+8(FP), R1 + MOVD n+16(FP), R2 + CBZ R2, copy0 -check: - CMP $16, R5 + // Small copies: 1..16 bytes + CMP $16, R2 BLE copy16 - AND $~31, R5, R7 // R7 is N&~31 - SUB R7, R5, R6 // R6 is N&31 - - CMP R3, R4 - BLT backward - - // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes. - // R3 and R4 are advanced as we copy. - - // (There may be implementations of armv8 where copying by bytes until - // at least one of source or dest is word aligned is a worthwhile - // optimization, but the on the one tested so far (xgene) it did not - // make a significance difference.) - - CBZ R7, noforwardlarge // Do we need to do any quadword copying? - - ADD R3, R7, R9 // R9 points just past where we copy by word - -forwardlargeloop: - // Copy 32 bytes at a time. - LDP.P 32(R4), (R8, R10) - STP.P (R8, R10), 32(R3) - LDP -16(R4), (R11, R12) - STP (R11, R12), -16(R3) - SUB $32, R7, R7 - CBNZ R7, forwardlargeloop - -noforwardlarge: - CBNZ R6, forwardtail // Do we need to copy any tail bytes? + // Large copies + CMP $128, R2 + BHI copy_long + CMP $32, R2 + BHI copy32_128 + + // Small copies: 17..32 bytes. + LDP (R1), (R6, R7) + ADD R1, R2, R4 // R4 points just past the last source byte + LDP -16(R4), (R12, R13) + STP (R6, R7), (R0) + ADD R0, R2, R5 // R5 points just past the last destination byte + STP (R12, R13), -16(R5) RET -forwardtail: - // There are R6 <= 31 bytes remaining to copy. - // This is large enough to still contain pointers, - // which must be copied atomically. - // Copy the next 16 bytes, then 8 bytes, then any remaining bytes. - TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0 - LDP.P 16(R4), (R8, R10) - STP.P (R8, R10), 16(R3) - - TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0 - MOVD.P 8(R4), R8 - MOVD.P R8, 8(R3) - - AND $7, R6 - CBNZ R6, 2(PC) - RET - - ADD R3, R6, R9 // R9 points just past the destination memory - -forwardtailloop: - MOVBU.P 1(R4), R8 - MOVBU.P R8, 1(R3) - CMP R3, R9 - BNE forwardtailloop - RET - - // Small copies: 1..16 bytes. +// Small copies: 1..16 bytes. copy16: - ADD R4, R5, R8 // R8 points just past the last source byte - ADD R3, R5, R9 // R9 points just past the last destination byte - CMP $8, R5 + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + CMP $8, R2 BLT copy7 - MOVD (R4), R6 - MOVD -8(R8), R7 - MOVD R6, (R3) - MOVD R7, -8(R9) + MOVD (R1), R6 + MOVD -8(R4), R7 + MOVD R6, (R0) + MOVD R7, -8(R5) RET copy7: - TBZ $2, R5, copy3 - MOVWU (R4), R6 - MOVWU -4(R8), R7 - MOVW R6, (R3) - MOVW R7, -4(R9) + TBZ $2, R2, copy3 + MOVWU (R1), R6 + MOVWU -4(R4), R7 + MOVW R6, (R0) + MOVW R7, -4(R5) RET copy3: - TBZ $1, R5, copy1 - MOVHU (R4), R6 - MOVHU -2(R8), R7 - MOVH R6, (R3) - MOVH R7, -2(R9) + TBZ $1, R2, copy1 + MOVHU (R1), R6 + MOVHU -2(R4), R7 + MOVH R6, (R0) + MOVH R7, -2(R5) RET copy1: - MOVBU (R4), R6 - MOVB R6, (R3) - RET - -backward: - // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords. - // R3 and R4 are advanced to the end of the destination/source buffers - // respectively and moved back as we copy. - - ADD R4, R5, R4 // R4 points just past the last source byte - ADD R3, R5, R3 // R3 points just past the last destination byte - - CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying? + MOVBU (R1), R6 + MOVB R6, (R0) - AND $7, R6, R12 - CBZ R12, backwardtaillarge - - SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte. -backwardtailloop: - // Copy sub-pointer-size tail. - MOVBU.W -1(R4), R8 - MOVBU.W R8, -1(R3) - CMP R9, R3 - BNE backwardtailloop - -backwardtaillarge: - // Do 8/16-byte write if possible. - // See comment at forwardtail. - TBZ $3, R6, 3(PC) - MOVD.W -8(R4), R8 - MOVD.W R8, -8(R3) +copy0: + RET - TBZ $4, R6, 3(PC) - LDP.W -16(R4), (R8, R10) - STP.W (R8, R10), -16(R3) + // Medium copies: 33..128 bytes. +copy32_128: + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + LDP (R1), (R6, R7) + LDP 16(R1), (R8, R9) + LDP -32(R4), (R10, R11) + LDP -16(R4), (R12, R13) + CMP $64, R2 + BHI copy128 + STP (R6, R7), (R0) + STP (R8, R9), 16(R0) + STP (R10, R11), -32(R5) + STP (R12, R13), -16(R5) + RET -nobackwardtail: - CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying? + // Copy 65..128 bytes. +copy128: + LDP 32(R1), (R14, R15) + LDP 48(R1), (R16, R17) + CMP $96, R2 + BLS copy96 + LDP -64(R4), (R2, R3) + LDP -48(R4), (R1, R4) + STP (R2, R3), -64(R5) + STP (R1, R4), -48(R5) + +copy96: + STP (R6, R7), (R0) + STP (R8, R9), 16(R0) + STP (R14, R15), 32(R0) + STP (R16, R17), 48(R0) + STP (R10, R11), -32(R5) + STP (R12, R13), -16(R5) RET -backwardlarge: - SUB R7, R3, R9 // R9 points at the lowest destination byte + // Copy more than 128 bytes. +copy_long: + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + MOVD ZR, R7 + MOVD ZR, R8 + + CMP $1024, R2 + BLT backward_check + // feature detect to decide how to align + MOVBU runtime·arm64UseAlignedLoads(SB), R6 + CBNZ R6, use_aligned_loads + MOVD R0, R7 + MOVD R5, R8 + B backward_check +use_aligned_loads: + MOVD R1, R7 + MOVD R4, R8 + // R7 and R8 are used here for the realignment calculation. In + // the use_aligned_loads case, R7 is the src pointer and R8 is + // srcend pointer, which is used in the backward copy case. + // When doing aligned stores, R7 is the dst pointer and R8 is + // the dstend pointer. + +backward_check: + // Use backward copy if there is an overlap. + SUB R1, R0, R14 + CBZ R14, copy0 + CMP R2, R14 + BCC copy_long_backward + + // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment. + LDP (R1), (R12, R13) // Load A + AND $15, R7, R14 // Calculate the realignment offset + SUB R14, R1, R1 + SUB R14, R0, R3 // move dst back same amount as src + ADD R14, R2, R2 + LDP 16(R1), (R6, R7) // Load B + STP (R12, R13), (R0) // Store A + LDP 32(R1), (R8, R9) // Load C + LDP 48(R1), (R10, R11) // Load D + LDP.W 64(R1), (R12, R13) // Load E + // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end + SUBS $144, R2, R2 + BLS copy64_from_end + +loop64: + STP (R6, R7), 16(R3) // Store B + LDP 16(R1), (R6, R7) // Load B (next iteration) + STP (R8, R9), 32(R3) // Store C + LDP 32(R1), (R8, R9) // Load C + STP (R10, R11), 48(R3) // Store D + LDP 48(R1), (R10, R11) // Load D + STP.W (R12, R13), 64(R3) // Store E + LDP.W 64(R1), (R12, R13) // Load E + SUBS $64, R2, R2 + BHI loop64 + + // Write the last iteration and copy 64 bytes from the end. +copy64_from_end: + LDP -64(R4), (R14, R15) // Load F + STP (R6, R7), 16(R3) // Store B + LDP -48(R4), (R6, R7) // Load G + STP (R8, R9), 32(R3) // Store C + LDP -32(R4), (R8, R9) // Load H + STP (R10, R11), 48(R3) // Store D + LDP -16(R4), (R10, R11) // Load I + STP (R12, R13), 64(R3) // Store E + STP (R14, R15), -64(R5) // Store F + STP (R6, R7), -48(R5) // Store G + STP (R8, R9), -32(R5) // Store H + STP (R10, R11), -16(R5) // Store I + RET -backwardlargeloop: - LDP -16(R4), (R8, R10) - STP (R8, R10), -16(R3) - LDP.W -32(R4), (R11, R12) - STP.W (R11, R12), -32(R3) - CMP R9, R3 - BNE backwardlargeloop + // Large backward copy for overlapping copies. + // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment. +copy_long_backward: + LDP -16(R4), (R12, R13) + AND $15, R8, R14 + SUB R14, R4, R4 + SUB R14, R2, R2 + LDP -16(R4), (R6, R7) + STP (R12, R13), -16(R5) + LDP -32(R4), (R8, R9) + LDP -48(R4), (R10, R11) + LDP.W -64(R4), (R12, R13) + SUB R14, R5, R5 + SUBS $128, R2, R2 + BLS copy64_from_start + +loop64_backward: + STP (R6, R7), -16(R5) + LDP -16(R4), (R6, R7) + STP (R8, R9), -32(R5) + LDP -32(R4), (R8, R9) + STP (R10, R11), -48(R5) + LDP -48(R4), (R10, R11) + STP.W (R12, R13), -64(R5) + LDP.W -64(R4), (R12, R13) + SUBS $64, R2, R2 + BHI loop64_backward + + // Write the last iteration and copy 64 bytes from the start. +copy64_from_start: + LDP 48(R1), (R2, R3) + STP (R6, R7), -16(R5) + LDP 32(R1), (R6, R7) + STP (R8, R9), -32(R5) + LDP 16(R1), (R8, R9) + STP (R10, R11), -48(R5) + LDP (R1), (R10, R11) + STP (R12, R13), -64(R5) + STP (R2, R3), 48(R0) + STP (R6, R7), 32(R0) + STP (R8, R9), 16(R0) + STP (R10, R11), (R0) RET diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index b549433f71..7c9d2ada45 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -286,6 +286,9 @@ var bufSizes = []int{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, } +var bufSizesOverlap = []int{ + 32, 64, 128, 256, 512, 1024, 2048, 4096, +} func BenchmarkMemmove(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { @@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) { }) } +func BenchmarkMemmoveOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { + copy(x[16:n+16], x[:n]) + } + }) +} + func BenchmarkMemmoveUnalignedDst(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n+1) @@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { + copy(x[16:n+16], x[1:n+1]) + } + }) +} + func BenchmarkMemmoveUnalignedSrc(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n) @@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+1) + for i := 0; i < b.N; i++ { + copy(x[1:n+1], x[:n]) + } + }) +} + func TestMemclr(t *testing.T) { size := 512 if testing.Short() { -- cgit v1.3 From 3510a1e32cbc86b73db143aefcc00aadc44c27bd Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Thu, 5 Nov 2020 05:59:34 +0100 Subject: internal/cpu: fix and cleanup ARM64 cpu feature fields and options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all cpu features from the ARM64 struct that are not initialized to reduce cache lines used and to avoid those features being accidentially used without actual detection if they are present. Add missing option to mask the CPUID feature. Change-Id: I94bf90c0655de1af2218ac72117ac6c52adfc289 Reviewed-on: https://go-review.googlesource.com/c/go/+/267658 Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Tobias Klauser Trust: Martin Möhrmann --- src/internal/cpu/cpu.go | 17 ----------------- src/internal/cpu/cpu_arm64.go | 1 + 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 0ceedcd7d2..dab5d068ef 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -57,30 +57,13 @@ var ARM struct { // The struct is padded to avoid false sharing. var ARM64 struct { _ CacheLinePad - HasFP bool - HasASIMD bool - HasEVTSTRM bool HasAES bool HasPMULL bool HasSHA1 bool HasSHA2 bool HasCRC32 bool HasATOMICS bool - HasFPHP bool - HasASIMDHP bool HasCPUID bool - HasASIMDRDM bool - HasJSCVT bool - HasFCMA bool - HasLRCPC bool - HasDCPOP bool - HasSHA3 bool - HasSM3 bool - HasSM4 bool - HasASIMDDP bool - HasSHA512 bool - HasSVE bool - HasASIMDFHM bool IsNeoverseN1 bool IsZeus bool _ CacheLinePad diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index 8fde39f03e..4e9ea8ca96 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -29,6 +29,7 @@ func doinit() { {Name: "sha2", Feature: &ARM64.HasSHA2}, {Name: "crc32", Feature: &ARM64.HasCRC32}, {Name: "atomics", Feature: &ARM64.HasATOMICS}, + {Name: "cpuid", Feature: &ARM64.HasCPUID}, {Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1}, {Name: "isZeus", Feature: &ARM64.IsZeus}, } -- cgit v1.3 From 3b2a578166bdedd94110698c971ba8990771eb89 Mon Sep 17 00:00:00 2001 From: Ikko Ashimine Date: Sat, 5 Dec 2020 14:43:53 +0000 Subject: internal/cpu: fix typo in cpu_arm64.go auxillary -> auxiliary Change-Id: I7c29c4a63d236c3688b8e4f5af70650d43cd89c0 GitHub-Last-Rev: d4a18c71a15cf0803bd847225ed5bf898c52e0f3 GitHub-Pull-Request: golang/go#43024 Reviewed-on: https://go-review.googlesource.com/c/go/+/275592 Reviewed-by: Ian Lance Taylor Trust: Keith Randall --- src/internal/cpu/cpu_arm64.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index 4e9ea8ca96..a8f7b2b458 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -36,7 +36,7 @@ func doinit() { switch GOOS { case "linux", "android": - // HWCap was populated by the runtime from the auxillary vector. + // HWCap was populated by the runtime from the auxiliary vector. // Use HWCap information since reading aarch64 system registers // is not supported in user space on older linux kernels. ARM64.HasAES = isSet(HWCap, hwcap_AES) @@ -103,7 +103,7 @@ func doinit() { ARM64.HasATOMICS = true } default: - // Other operating systems do not support reading HWCap from auxillary vector + // Other operating systems do not support reading HWCap from auxiliary vector // or reading privileged aarch64 system registers in user space. } } -- cgit v1.3 From c15593197453b8bf90fc3a9080ba2afeaf7934ea Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Sat, 21 Nov 2020 17:44:04 +0100 Subject: internal/cpu: add darwin/arm64 CPU feature detection support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #42747 Change-Id: I6b1679348c77161f075f0678818bb003fc0e8c86 Reviewed-on: https://go-review.googlesource.com/c/go/+/271989 Trust: Martin Möhrmann Run-TryBot: Martin Möhrmann TryBot-Result: Go Bot Reviewed-by: Cherry Zhang --- src/internal/cpu/cpu_android.go | 7 --- src/internal/cpu/cpu_arm64.go | 97 +---------------------------------- src/internal/cpu/cpu_arm64_android.go | 11 ++++ src/internal/cpu/cpu_arm64_darwin.go | 34 ++++++++++++ src/internal/cpu/cpu_arm64_freebsd.go | 45 ++++++++++++++++ src/internal/cpu/cpu_arm64_hwcap.go | 63 +++++++++++++++++++++++ src/internal/cpu/cpu_arm64_linux.go | 13 +++++ src/internal/cpu/cpu_arm64_other.go | 17 ++++++ src/internal/cpu/cpu_freebsd.go | 7 --- src/internal/cpu/cpu_linux.go | 9 ---- src/internal/cpu/cpu_other.go | 11 ---- src/internal/cpu/cpu_test.go | 7 +-- src/runtime/os_darwin.go | 12 +++++ src/runtime/sys_darwin.go | 10 +++- src/runtime/sys_darwin_amd64.s | 20 ++++++-- src/runtime/sys_darwin_arm64.s | 18 +++++-- 16 files changed, 238 insertions(+), 143 deletions(-) delete mode 100644 src/internal/cpu/cpu_android.go create mode 100644 src/internal/cpu/cpu_arm64_android.go create mode 100644 src/internal/cpu/cpu_arm64_darwin.go create mode 100644 src/internal/cpu/cpu_arm64_freebsd.go create mode 100644 src/internal/cpu/cpu_arm64_hwcap.go create mode 100644 src/internal/cpu/cpu_arm64_linux.go create mode 100644 src/internal/cpu/cpu_arm64_other.go delete mode 100644 src/internal/cpu/cpu_freebsd.go delete mode 100644 src/internal/cpu/cpu_linux.go delete mode 100644 src/internal/cpu/cpu_other.go (limited to 'src/internal/cpu/cpu_arm64.go') diff --git a/src/internal/cpu/cpu_android.go b/src/internal/cpu/cpu_android.go deleted file mode 100644 index d995e8d5a7..0000000000 --- a/src/internal/cpu/cpu_android.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package cpu - -const GOOS = "android" diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go index a8f7b2b458..f64d9e4dd3 100644 --- a/src/internal/cpu/cpu_arm64.go +++ b/src/internal/cpu/cpu_arm64.go @@ -6,21 +6,6 @@ package cpu const CacheLinePadSize = 64 -// HWCap may be initialized by archauxv and -// should not be changed after it was initialized. -var HWCap uint - -// HWCAP bits. These are exposed by Linux. -const ( - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_CPUID = 1 << 11 -) - func doinit() { options = []option{ {Name: "aes", Feature: &ARM64.HasAES}, @@ -34,86 +19,8 @@ func doinit() { {Name: "isZeus", Feature: &ARM64.IsZeus}, } - switch GOOS { - case "linux", "android": - // HWCap was populated by the runtime from the auxiliary vector. - // Use HWCap information since reading aarch64 system registers - // is not supported in user space on older linux kernels. - ARM64.HasAES = isSet(HWCap, hwcap_AES) - ARM64.HasPMULL = isSet(HWCap, hwcap_PMULL) - ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) - ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) - ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) - ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID) - - // The Samsung S9+ kernel reports support for atomics, but not all cores - // actually support them, resulting in SIGILL. See issue #28431. - // TODO(elias.naur): Only disable the optimization on bad chipsets on android. - ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android" - - // Check to see if executing on a NeoverseN1 and in order to do that, - // check the AUXV for the CPUID bit. The getMIDR function executes an - // instruction which would normally be an illegal instruction, but it's - // trapped by the kernel, the value sanitized and then returned. Without - // the CPUID bit the kernel will not trap the instruction and the process - // will be terminated with SIGILL. - if ARM64.HasCPUID { - midr := getMIDR() - part_num := uint16((midr >> 4) & 0xfff) - implementor := byte((midr >> 24) & 0xff) - - if implementor == 'A' && part_num == 0xd0c { - ARM64.IsNeoverseN1 = true - } - if implementor == 'A' && part_num == 0xd40 { - ARM64.IsZeus = true - } - } - - case "freebsd": - // Retrieve info from system register ID_AA64ISAR0_EL1. - isar0 := getisar0() - - // ID_AA64ISAR0_EL1 - switch extractBits(isar0, 4, 7) { - case 1: - ARM64.HasAES = true - case 2: - ARM64.HasAES = true - ARM64.HasPMULL = true - } - - switch extractBits(isar0, 8, 11) { - case 1: - ARM64.HasSHA1 = true - } - - switch extractBits(isar0, 12, 15) { - case 1, 2: - ARM64.HasSHA2 = true - } - - switch extractBits(isar0, 16, 19) { - case 1: - ARM64.HasCRC32 = true - } - - switch extractBits(isar0, 20, 23) { - case 2: - ARM64.HasATOMICS = true - } - default: - // Other operating systems do not support reading HWCap from auxiliary vector - // or reading privileged aarch64 system registers in user space. - } -} - -func extractBits(data uint64, start, end uint) uint { - return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) -} - -func isSet(hwc uint, value uint) bool { - return hwc&value != 0 + // arm64 uses different ways to detect CPU features at runtime depending on the operating system. + osInit() } func getisar0() uint64 diff --git a/src/internal/cpu/cpu_arm64_android.go b/src/internal/cpu/cpu_arm64_android.go new file mode 100644 index 0000000000..3c9e57c52a --- /dev/null +++ b/src/internal/cpu/cpu_arm64_android.go @@ -0,0 +1,11 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 + +package cpu + +func osInit() { + hwcapInit("android") +} diff --git a/src/internal/cpu/cpu_arm64_darwin.go b/src/internal/cpu/cpu_arm64_darwin.go new file mode 100644 index 0000000000..e094b97f97 --- /dev/null +++ b/src/internal/cpu/cpu_arm64_darwin.go @@ -0,0 +1,34 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 +// +build darwin +// +build !ios + +package cpu + +func osInit() { + ARM64.HasATOMICS = sysctlEnabled([]byte("hw.optional.armv8_1_atomics\x00")) + ARM64.HasCRC32 = sysctlEnabled([]byte("hw.optional.armv8_crc32\x00")) + + // There are no hw.optional sysctl values for the below features on Mac OS 11.0 + // to detect their supported state dynamically. Assume the CPU features that + // Apple Silicon M1 supports to be available as a minimal set of features + // to all Go programs running on darwin/arm64. + ARM64.HasAES = true + ARM64.HasPMULL = true + ARM64.HasSHA1 = true + ARM64.HasSHA2 = true +} + +//go:noescape +func getsysctlbyname(name []byte) (int32, int32) + +func sysctlEnabled(name []byte) bool { + ret, value := getsysctlbyname(name) + if ret < 0 { + return false + } + return value > 0 +} diff --git a/src/internal/cpu/cpu_arm64_freebsd.go b/src/internal/cpu/cpu_arm64_freebsd.go new file mode 100644 index 0000000000..9de2005c2e --- /dev/null +++ b/src/internal/cpu/cpu_arm64_freebsd.go @@ -0,0 +1,45 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 + +package cpu + +func osInit() { + // Retrieve info from system register ID_AA64ISAR0_EL1. + isar0 := getisar0() + + // ID_AA64ISAR0_EL1 + switch extractBits(isar0, 4, 7) { + case 1: + ARM64.HasAES = true + case 2: + ARM64.HasAES = true + ARM64.HasPMULL = true + } + + switch extractBits(isar0, 8, 11) { + case 1: + ARM64.HasSHA1 = true + } + + switch extractBits(isar0, 12, 15) { + case 1, 2: + ARM64.HasSHA2 = true + } + + switch extractBits(isar0, 16, 19) { + case 1: + ARM64.HasCRC32 = true + } + + switch extractBits(isar0, 20, 23) { + case 2: + ARM64.HasATOMICS = true + } +} + +func extractBits(data uint64, start, end uint) uint { + return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) +} diff --git a/src/internal/cpu/cpu_arm64_hwcap.go b/src/internal/cpu/cpu_arm64_hwcap.go new file mode 100644 index 0000000000..fdaf43e1a2 --- /dev/null +++ b/src/internal/cpu/cpu_arm64_hwcap.go @@ -0,0 +1,63 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 +// +build linux + +package cpu + +// HWCap may be initialized by archauxv and +// should not be changed after it was initialized. +var HWCap uint + +// HWCAP bits. These are exposed by Linux. +const ( + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 + hwcap_CPUID = 1 << 11 +) + +func hwcapInit(os string) { + // HWCap was populated by the runtime from the auxiliary vector. + // Use HWCap information since reading aarch64 system registers + // is not supported in user space on older linux kernels. + ARM64.HasAES = isSet(HWCap, hwcap_AES) + ARM64.HasPMULL = isSet(HWCap, hwcap_PMULL) + ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1) + ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2) + ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32) + ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID) + + // The Samsung S9+ kernel reports support for atomics, but not all cores + // actually support them, resulting in SIGILL. See issue #28431. + // TODO(elias.naur): Only disable the optimization on bad chipsets on android. + ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && os != "android" + + // Check to see if executing on a NeoverseN1 and in order to do that, + // check the AUXV for the CPUID bit. The getMIDR function executes an + // instruction which would normally be an illegal instruction, but it's + // trapped by the kernel, the value sanitized and then returned. Without + // the CPUID bit the kernel will not trap the instruction and the process + // will be terminated with SIGILL. + if ARM64.HasCPUID { + midr := getMIDR() + part_num := uint16((midr >> 4) & 0xfff) + implementor := byte((midr >> 24) & 0xff) + + if implementor == 'A' && part_num == 0xd0c { + ARM64.IsNeoverseN1 = true + } + if implementor == 'A' && part_num == 0xd40 { + ARM64.IsZeus = true + } + } +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/src/internal/cpu/cpu_arm64_linux.go b/src/internal/cpu/cpu_arm64_linux.go new file mode 100644 index 0000000000..2f7411ff1e --- /dev/null +++ b/src/internal/cpu/cpu_arm64_linux.go @@ -0,0 +1,13 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 +// +build linux +// +build !android + +package cpu + +func osInit() { + hwcapInit("linux") +} diff --git a/src/internal/cpu/cpu_arm64_other.go b/src/internal/cpu/cpu_arm64_other.go new file mode 100644 index 0000000000..f191db28d2 --- /dev/null +++ b/src/internal/cpu/cpu_arm64_other.go @@ -0,0 +1,17 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm64 +// +build !linux +// +build !freebsd +// +build !android +// +build !darwin ios + +package cpu + +func osInit() { + // Other operating systems do not support reading HWCap from auxiliary vector, + // reading privileged aarch64 system registers or sysctl in user space to detect + // CPU features at runtime. +} diff --git a/src/internal/cpu/cpu_freebsd.go b/src/internal/cpu/cpu_freebsd.go deleted file mode 100644 index dc37173dac..0000000000 --- a/src/internal/cpu/cpu_freebsd.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package cpu - -const GOOS = "freebsd" diff --git a/src/internal/cpu/cpu_linux.go b/src/internal/cpu/cpu_linux.go deleted file mode 100644 index ec0b84c510..0000000000 --- a/src/internal/cpu/cpu_linux.go +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !android - -package cpu - -const GOOS = "linux" diff --git a/src/internal/cpu/cpu_other.go b/src/internal/cpu/cpu_other.go deleted file mode 100644 index 8a15fbe79d..0000000000 --- a/src/internal/cpu/cpu_other.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !linux -// +build !freebsd -// +build !android - -package cpu - -const GOOS = "other" diff --git a/src/internal/cpu/cpu_test.go b/src/internal/cpu/cpu_test.go index 919bbd5ed7..2de7365732 100644 --- a/src/internal/cpu/cpu_test.go +++ b/src/internal/cpu/cpu_test.go @@ -18,7 +18,7 @@ func TestMinimalFeatures(t *testing.T) { // TODO: maybe do MustSupportFeatureDectection(t) ? if runtime.GOARCH == "arm64" { switch runtime.GOOS { - case "linux", "android": + case "linux", "android", "darwin": default: t.Skipf("%s/%s is not supported", runtime.GOOS, runtime.GOARCH) } @@ -38,10 +38,7 @@ func MustHaveDebugOptionsSupport(t *testing.T) { } func MustSupportFeatureDectection(t *testing.T) { - if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { - t.Skipf("CPU feature detection is not supported on %s/%s", runtime.GOOS, runtime.GOARCH) - } - // TODO: maybe there are other platforms? + // TODO: add platforms that do not have CPU feature detection support. } func runDebugOptionsTest(t *testing.T, test string, options string) { diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 52f3cd1fef..e0a43c28aa 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -130,6 +130,18 @@ func osinit() { physPageSize = getPageSize() } +func sysctlbynameInt32(name []byte) (int32, int32) { + out := int32(0) + nout := unsafe.Sizeof(out) + ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + return ret, out +} + +//go:linkname internal_cpu_getsysctlbyname internal/cpu.getsysctlbyname +func internal_cpu_getsysctlbyname(name []byte) (int32, int32) { + return sysctlbynameInt32(name) +} + const ( _CTL_HW = 6 _HW_NCPU = 3 diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go index c63ba8c6cd..c89ce78012 100644 --- a/src/runtime/sys_darwin.go +++ b/src/runtime/sys_darwin.go @@ -360,11 +360,18 @@ func setitimer_trampoline() //go:nosplit //go:cgo_unsafe_args -func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 { +func sysctl(mib *uint32, miblen uint32, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { return libcCall(unsafe.Pointer(funcPC(sysctl_trampoline)), unsafe.Pointer(&mib)) } func sysctl_trampoline() +//go:nosplit +//go:cgo_unsafe_args +func sysctlbyname(name *byte, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { + return libcCall(unsafe.Pointer(funcPC(sysctlbyname_trampoline)), unsafe.Pointer(&name)) +} +func sysctlbyname_trampoline() + //go:nosplit //go:cgo_unsafe_args func fcntl(fd, cmd, arg int32) int32 { @@ -486,6 +493,7 @@ func setNonblock(fd int32) { //go:cgo_import_dynamic libc_kill kill "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_setitimer setitimer "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_fcntl fcntl "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_kqueue kqueue "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_kevent kevent "/usr/lib/libSystem.B.dylib" diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index 9b5b23901d..630fb5df64 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -371,15 +371,27 @@ TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0 PUSHQ BP MOVQ SP, BP MOVL 8(DI), SI // arg 2 miblen - MOVQ 16(DI), DX // arg 3 out - MOVQ 24(DI), CX // arg 4 size - MOVQ 32(DI), R8 // arg 5 dst - MOVQ 40(DI), R9 // arg 6 ndst + MOVQ 16(DI), DX // arg 3 oldp + MOVQ 24(DI), CX // arg 4 oldlenp + MOVQ 32(DI), R8 // arg 5 newp + MOVQ 40(DI), R9 // arg 6 newlen MOVQ 0(DI), DI // arg 1 mib CALL libc_sysctl(SB) POPQ BP RET +TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 oldp + MOVQ 16(DI), DX // arg 3 oldlenp + MOVQ 24(DI), CX // arg 4 newp + MOVQ 32(DI), R8 // arg 5 newlen + MOVQ 0(DI), DI // arg 1 name + CALL libc_sysctlbyname(SB) + POPQ BP + RET + TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0 PUSHQ BP MOVQ SP, BP diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s index 9d4d116c50..96d2ed1076 100644 --- a/src/runtime/sys_darwin_arm64.s +++ b/src/runtime/sys_darwin_arm64.s @@ -301,14 +301,24 @@ TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0 TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0 MOVW 8(R0), R1 // arg 2 miblen - MOVD 16(R0), R2 // arg 3 out - MOVD 24(R0), R3 // arg 4 size - MOVD 32(R0), R4 // arg 5 dst - MOVD 40(R0), R5 // arg 6 ndst + MOVD 16(R0), R2 // arg 3 oldp + MOVD 24(R0), R3 // arg 4 oldlenp + MOVD 32(R0), R4 // arg 5 newp + MOVD 40(R0), R5 // arg 6 newlen MOVD 0(R0), R0 // arg 1 mib BL libc_sysctl(SB) RET +TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0 + MOVD 8(R0), R1 // arg 2 oldp + MOVD 16(R0), R2 // arg 3 oldlenp + MOVD 24(R0), R3 // arg 4 newp + MOVD 32(R0), R4 // arg 5 newlen + MOVD 0(R0), R0 // arg 1 name + BL libc_sysctlbyname(SB) + RET + + TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0 BL libc_kqueue(SB) RET -- cgit v1.3