From 92bda33d2771a9b12868d9025f113538fa7a84de Mon Sep 17 00:00:00 2001
From: Austin Clements
Date: Fri, 31 Jul 2020 15:58:00 -0400
Subject: runtime: revert signal stack mlocking

Go 1.14 included a (rather awful) workaround for a Linux kernel bug
that corrupted vector registers on x86 CPUs during signal delivery
(https://bugzilla.kernel.org/show_bug.cgi?id=205663). This bug was
introduced in Linux 5.2 and fixed in 5.3.15, 5.4.2 and all 5.5 and
later kernels. The fix was also back-ported by major distros. This
workaround was necessary, but had unfortunate downsides, including
causing Go programs to exceed the mlock ulimit in many configurations
(#37436).

We're reasonably confident that by the Go 1.16 release, the number of
systems running affected kernels will be vanishingly small. Hence,
this CL removes this workaround.

This effectively reverts CLs 209597 (version parser), 209899 (mlock
top of signal stack), 210299 (better failure message), 223121 (soft
mlock failure handling), and 244059 (special-case patched Ubuntu
kernels). The one thing we keep is the osArchInit function. It's empty
everywhere now, but is a reasonable hook to have.

Updates #35326, #35777 (the original register corruption bugs).
Updates #40184 (request to revert in 1.15).
Fixes #35979.

Change-Id: Ie213270837095576f1f3ef46bf3de187dc486c50 Reviewed-on: https://go-review.googlesource.com/c/go/+/246200 Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/runtime/sys_linux_amd64.s | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'src/runtime/sys_linux_amd64.s') diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index fe9c6bce85..b60057ce83 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -33,10 +33,8 @@ #define SYS_clone 56 #define SYS_exit 60 #define SYS_kill 62 -#define SYS_uname 63 #define SYS_fcntl 72 #define SYS_sigaltstack 131 -#define SYS_mlock 149 #define SYS_arch_prctl 158 #define SYS_gettid 186 #define SYS_futex 202 @@ -789,20 +787,3 @@ TEXT runtime·sbrk0(SB),NOSPLIT,$0-8 SYSCALL MOVQ AX, ret+0(FP) RET - -// func uname(utsname *new_utsname) int -TEXT ·uname(SB),NOSPLIT,$0-16 - MOVQ utsname+0(FP), DI - MOVL $SYS_uname, AX - SYSCALL - MOVQ AX, ret+8(FP) - RET - -// func mlock(addr, len uintptr) int -TEXT ·mlock(SB),NOSPLIT,$0-24 - MOVQ addr+0(FP), DI - MOVQ len+8(FP), SI - MOVL $SYS_mlock, AX - SYSCALL - MOVQ AX, ret+16(FP) - RET -- cgit v1.3 From c6a11f0dd279f374602794af60c7cde4585a1e6f Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 11 Aug 2020 13:04:48 -0700 Subject: crypto,internal/bytealg: fix assembly that clobbers BP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BP should be callee-save. It will be saved automatically if there is a nonzero frame size. Otherwise, we need to avoid this register. 
Change-Id: If3f551efa42d830c8793d9f0183cb8daad7a2ab5 Reviewed-on: https://go-review.googlesource.com/c/go/+/248260 Run-TryBot: Keith Randall Reviewed-by: Michael Knyszek Reviewed-by: Martin Möhrmann TryBot-Result: Gobot Gobot --- src/crypto/elliptic/p256_asm_amd64.s | 5 ++-- src/crypto/md5/md5block_amd64.s | 2 +- src/internal/bytealg/index_amd64.s | 52 ++++++++++++++++++------------------ src/runtime/sys_linux_amd64.s | 8 +++--- 4 files changed, 33 insertions(+), 34 deletions(-) (limited to 'src/runtime/sys_linux_amd64.s') diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s index 7afa54a58c..c77b11bcf2 100644 --- a/src/crypto/elliptic/p256_asm_amd64.s +++ b/src/crypto/elliptic/p256_asm_amd64.s @@ -1336,7 +1336,7 @@ TEXT p256SubInternal(SB),NOSPLIT,$0 RET /* ---------------------------------------*/ -TEXT p256MulInternal(SB),NOSPLIT,$0 +TEXT p256MulInternal(SB),NOSPLIT,$8 MOVQ acc4, mul0 MULQ t0 MOVQ mul0, acc0 @@ -1519,7 +1519,7 @@ TEXT p256MulInternal(SB),NOSPLIT,$0 RET /* ---------------------------------------*/ -TEXT p256SqrInternal(SB),NOSPLIT,$0 +TEXT p256SqrInternal(SB),NOSPLIT,$8 MOVQ acc4, mul0 MULQ acc5 @@ -2345,4 +2345,3 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 RET /* ---------------------------------------*/ - diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s index 90d932b146..7c7d92d7e8 100644 --- a/src/crypto/md5/md5block_amd64.s +++ b/src/crypto/md5/md5block_amd64.s @@ -13,7 +13,7 @@ // Licence: I hereby disclaim the copyright on this code and place it // in the public domain. 
-TEXT ·block(SB),NOSPLIT,$0-32 +TEXT ·block(SB),NOSPLIT,$8-32 MOVQ dig+0(FP), BP MOVQ p+8(FP), SI MOVQ p_len+16(FP), DX diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s index 4459820801..6193b57239 100644 --- a/src/internal/bytealg/index_amd64.s +++ b/src/internal/bytealg/index_amd64.s @@ -8,7 +8,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56 MOVQ a_base+0(FP), DI MOVQ a_len+8(FP), DX - MOVQ b_base+24(FP), BP + MOVQ b_base+24(FP), R8 MOVQ b_len+32(FP), AX MOVQ DI, R10 LEAQ ret+48(FP), R11 @@ -17,7 +17,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56 TEXT ·IndexString(SB),NOSPLIT,$0-40 MOVQ a_base+0(FP), DI MOVQ a_len+8(FP), DX - MOVQ b_base+16(FP), BP + MOVQ b_base+16(FP), R8 MOVQ b_len+24(FP), AX MOVQ DI, R10 LEAQ ret+32(FP), R11 @@ -26,7 +26,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40 // AX: length of string, that we are searching for // DX: length of string, in which we are searching // DI: pointer to string, in which we are searching -// BP: pointer to string, that we are searching for +// R8: pointer to string, that we are searching for // R11: address, where to put return value // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them TEXT indexbody<>(SB),NOSPLIT,$0 @@ -37,11 +37,11 @@ TEXT indexbody<>(SB),NOSPLIT,$0 no_sse42: CMPQ AX, $2 JA _3_or_more - MOVW (BP), BP + MOVW (R8), R8 LEAQ -1(DI)(DX*1), DX loop2: MOVW (DI), SI - CMPW SI,BP + CMPW SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -50,12 +50,12 @@ loop2: _3_or_more: CMPQ AX, $3 JA _4_or_more - MOVW 1(BP), BX - MOVW (BP), BP + MOVW 1(R8), BX + MOVW (R8), R8 LEAQ -2(DI)(DX*1), DX loop3: MOVW (DI), SI - CMPW SI,BP + CMPW SI,R8 JZ partial_success3 ADDQ $1,DI CMPQ DI,DX @@ -72,11 +72,11 @@ partial_success3: _4_or_more: CMPQ AX, $4 JA _5_or_more - MOVL (BP), BP + MOVL (R8), R8 LEAQ -3(DI)(DX*1), DX loop4: MOVL (DI), SI - CMPL SI,BP + CMPL SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -87,11 +87,11 @@ _5_or_more: JA _8_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVL -4(BP)(AX*1), BX - 
MOVL (BP), BP + MOVL -4(R8)(AX*1), BX + MOVL (R8), R8 loop5to7: MOVL (DI), SI - CMPL SI,BP + CMPL SI,R8 JZ partial_success5to7 ADDQ $1,DI CMPQ DI,DX @@ -108,11 +108,11 @@ partial_success5to7: _8_or_more: CMPQ AX, $8 JA _9_or_more - MOVQ (BP), BP + MOVQ (R8), R8 LEAQ -7(DI)(DX*1), DX loop8: MOVQ (DI), SI - CMPQ SI,BP + CMPQ SI,R8 JZ success ADDQ $1,DI CMPQ DI,DX @@ -123,11 +123,11 @@ _9_or_more: JA _16_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVQ -8(BP)(AX*1), BX - MOVQ (BP), BP + MOVQ -8(R8)(AX*1), BX + MOVQ (R8), R8 loop9to15: MOVQ (DI), SI - CMPQ SI,BP + CMPQ SI,R8 JZ partial_success9to15 ADDQ $1,DI CMPQ DI,DX @@ -144,7 +144,7 @@ partial_success9to15: _16_or_more: CMPQ AX, $16 JA _17_or_more - MOVOU (BP), X1 + MOVOU (R8), X1 LEAQ -15(DI)(DX*1), DX loop16: MOVOU (DI), X2 @@ -161,8 +161,8 @@ _17_or_more: JA _32_or_more LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - MOVOU -16(BP)(AX*1), X0 - MOVOU (BP), X1 + MOVOU -16(R8)(AX*1), X0 + MOVOU (R8), X1 loop17to31: MOVOU (DI), X2 PCMPEQB X1,X2 @@ -188,7 +188,7 @@ partial_success17to31: _32_or_more: CMPQ AX, $32 JA _33_to_63 - VMOVDQU (BP), Y1 + VMOVDQU (R8), Y1 LEAQ -31(DI)(DX*1), DX loop32: VMOVDQU (DI), Y2 @@ -203,8 +203,8 @@ loop32: _33_to_63: LEAQ 1(DI)(DX*1), DX SUBQ AX, DX - VMOVDQU -32(BP)(AX*1), Y0 - VMOVDQU (BP), Y1 + VMOVDQU -32(R8)(AX*1), Y0 + VMOVDQU (R8), Y1 loop33to63: VMOVDQU (DI), Y2 VPCMPEQB Y1, Y2, Y3 @@ -241,10 +241,10 @@ sse42: // This value was determined experimentally and is the ~same // on Nehalem (first with SSE42) and Haswell. 
JAE _9_or_more - LEAQ 16(BP), SI + LEAQ 16(R8), SI TESTW $0xff0, SI JEQ no_sse42 - MOVOU (BP), X1 + MOVOU (R8), X1 LEAQ -15(DI)(DX*1), SI MOVQ $16, R9 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index b60057ce83..621c01b365 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -212,7 +212,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12 // due to stack probes inserted to avoid stack/heap collisions. // See issue #20427. - MOVQ SP, BP // Save old SP; BP unchanged by C code. + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. get_tls(CX) MOVQ g(CX), AX @@ -250,7 +250,7 @@ noswitch: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec ret: - MOVQ BP, SP // Restore real SP + MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. // If we are not in a signal handler, we'll restore vdsoSP to 0, @@ -277,7 +277,7 @@ fallback: TEXT runtime·nanotime1(SB),NOSPLIT,$16-8 // Switch to g0 stack. See comment above in runtime·walltime. - MOVQ SP, BP // Save old SP; BP unchanged by C code. + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. get_tls(CX) MOVQ g(CX), AX @@ -315,7 +315,7 @@ noswitch: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec ret: - MOVQ BP, SP // Restore real SP + MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. 
// If we are not in a signal handler, we'll restore vdsoSP to 0, -- cgit v1.3 From 0941fc3f9ff43598d25fa6e964e7829a268102bf Mon Sep 17 00:00:00 2001 From: cui Date: Wed, 12 Aug 2020 17:33:41 +0000 Subject: runtime: reduce syscall when call runtime.clone Change-Id: I3ea398fd86aae4c86557dd6fff65d90a6f756890 GitHub-Last-Rev: 4c295388f7b5e6768ffd2530337f78b4c75a9310 GitHub-Pull-Request: golang/go#40392 Reviewed-on: https://go-review.googlesource.com/c/go/+/244626 Run-TryBot: Ian Lance Taylor TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/runtime/sys_linux_amd64.s | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'src/runtime/sys_linux_amd64.s') diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 621c01b365..8d90813589 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -592,13 +592,25 @@ TEXT runtime·clone(SB),NOSPLIT,$0 MOVQ stk+8(FP), SI MOVQ $0, DX MOVQ $0, R10 - + MOVQ $0, R8 // Copy mp, gp, fn off parent stack for use by child. // Careful: Linux system call clobbers CX and R11. - MOVQ mp+16(FP), R8 + MOVQ mp+16(FP), R13 MOVQ gp+24(FP), R9 MOVQ fn+32(FP), R12 - + CMPQ R13, $0 // m + JEQ nog1 + CMPQ R9, $0 // g + JEQ nog1 + LEAQ m_tls(R13), R8 +#ifdef GOOS_android + // Android stores the TLS offset in runtime·tls_g. + SUBQ runtime·tls_g(SB), R8 +#else + ADDQ $8, R8 // ELF wants to use -8(FS) +#endif + ORQ $0x00080000, DI //add flag CLONE_SETTLS(0x00080000) to call clone +nog1: MOVL $SYS_clone, AX SYSCALL @@ -612,27 +624,23 @@ TEXT runtime·clone(SB),NOSPLIT,$0 MOVQ SI, SP // If g or m are nil, skip Go-related setup. - CMPQ R8, $0 // m - JEQ nog + CMPQ R13, $0 // m + JEQ nog2 CMPQ R9, $0 // g - JEQ nog + JEQ nog2 // Initialize m->procid to Linux tid MOVL $SYS_gettid, AX SYSCALL - MOVQ AX, m_procid(R8) - - // Set FS to point at m->tls. 
- LEAQ m_tls(R8), DI - CALL runtime·settls(SB) + MOVQ AX, m_procid(R13) // In child, set up new stack get_tls(CX) - MOVQ R8, g_m(R9) + MOVQ R13, g_m(R9) MOVQ R9, g(CX) CALL runtime·stackcheck(SB) -nog: +nog2: // Call fn CALL R12 -- cgit v1.3