From 92bda33d2771a9b12868d9025f113538fa7a84de Mon Sep 17 00:00:00 2001
From: Austin Clements <austin@google.com>
Date: Fri, 31 Jul 2020 15:58:00 -0400
Subject: runtime: revert signal stack mlocking

Go 1.14 included a (rather awful) workaround for a Linux kernel bug
that corrupted vector registers on x86 CPUs during signal delivery
(https://bugzilla.kernel.org/show_bug.cgi?id=205663). This bug was
introduced in Linux 5.2 and fixed in 5.3.15, 5.4.2 and all 5.5 and
later kernels. The fix was also back-ported by major distros. This
workaround was necessary, but had unfortunate downsides, including
causing Go programs to exceed the mlock ulimit in many configurations
(#37436).

We're reasonably confident that by the Go 1.16 release, the number of
systems running affected kernels will be vanishingly small. Hence,
this CL removes this workaround.

This effectively reverts CLs 209597 (version parser), 209899 (mlock
top of signal stack), 210299 (better failure message), 223121 (soft
mlock failure handling), and 244059 (special-case patched Ubuntu
kernels). The one thing we keep is the osArchInit function. It's empty
everywhere now, but is a reasonable hook to have.

Updates #35326, #35777 (the original register corruption bugs).
Updates #40184 (request to revert in 1.15).
Fixes #35979.

Change-Id: Ie213270837095576f1f3ef46bf3de187dc486c50
Reviewed-on: https://go-review.googlesource.com/c/go/+/246200
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
---
 src/runtime/sys_linux_amd64.s | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'src/runtime/sys_linux_amd64.s')

diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index fe9c6bce85..b60057ce83 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -33,10 +33,8 @@
 #define SYS_clone		56
 #define SYS_exit		60
 #define SYS_kill		62
-#define SYS_uname		63
 #define SYS_fcntl		72
 #define SYS_sigaltstack 	131
-#define SYS_mlock		149
 #define SYS_arch_prctl		158
 #define SYS_gettid		186
 #define SYS_futex		202
@@ -789,20 +787,3 @@ TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
 	SYSCALL
 	MOVQ	AX, ret+0(FP)
 	RET
-
-// func uname(utsname *new_utsname) int
-TEXT ·uname(SB),NOSPLIT,$0-16
-	MOVQ    utsname+0(FP), DI
-	MOVL    $SYS_uname, AX
-	SYSCALL
-	MOVQ	AX, ret+8(FP)
-	RET
-
-// func mlock(addr, len uintptr) int
-TEXT ·mlock(SB),NOSPLIT,$0-24
-	MOVQ    addr+0(FP), DI
-	MOVQ    len+8(FP), SI
-	MOVL    $SYS_mlock, AX
-	SYSCALL
-	MOVQ	AX, ret+16(FP)
-	RET
-- 
cgit v1.3


From c6a11f0dd279f374602794af60c7cde4585a1e6f Mon Sep 17 00:00:00 2001
From: Keith Randall <khr@golang.org>
Date: Tue, 11 Aug 2020 13:04:48 -0700
Subject: crypto,internal/bytealg: fix assembly that clobbers BP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BP should be callee-save. It will be saved automatically if
there is a nonzero frame size. Otherwise, we need to avoid this register.

Change-Id: If3f551efa42d830c8793d9f0183cb8daad7a2ab5
Reviewed-on: https://go-review.googlesource.com/c/go/+/248260
Run-TryBot: Keith Randall <khr@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/crypto/elliptic/p256_asm_amd64.s |  5 ++--
 src/crypto/md5/md5block_amd64.s      |  2 +-
 src/internal/bytealg/index_amd64.s   | 52 ++++++++++++++++++------------------
 src/runtime/sys_linux_amd64.s        |  8 +++---
 4 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'src/runtime/sys_linux_amd64.s')

diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s
index 7afa54a58c..c77b11bcf2 100644
--- a/src/crypto/elliptic/p256_asm_amd64.s
+++ b/src/crypto/elliptic/p256_asm_amd64.s
@@ -1336,7 +1336,7 @@ TEXT p256SubInternal(SB),NOSPLIT,$0
 
 	RET
 /* ---------------------------------------*/
-TEXT p256MulInternal(SB),NOSPLIT,$0
+TEXT p256MulInternal(SB),NOSPLIT,$8
 	MOVQ acc4, mul0
 	MULQ t0
 	MOVQ mul0, acc0
@@ -1519,7 +1519,7 @@ TEXT p256MulInternal(SB),NOSPLIT,$0
 
 	RET
 /* ---------------------------------------*/
-TEXT p256SqrInternal(SB),NOSPLIT,$0
+TEXT p256SqrInternal(SB),NOSPLIT,$8
 
 	MOVQ acc4, mul0
 	MULQ acc5
@@ -2345,4 +2345,3 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
 
 	RET
 /* ---------------------------------------*/
-
diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s
index 90d932b146..7c7d92d7e8 100644
--- a/src/crypto/md5/md5block_amd64.s
+++ b/src/crypto/md5/md5block_amd64.s
@@ -13,7 +13,7 @@
 // Licence: I hereby disclaim the copyright on this code and place it
 // in the public domain.
 
-TEXT	·block(SB),NOSPLIT,$0-32
+TEXT	·block(SB),NOSPLIT,$8-32
 	MOVQ	dig+0(FP),	BP
 	MOVQ	p+8(FP),	SI
 	MOVQ	p_len+16(FP), DX
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 4459820801..6193b57239 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -8,7 +8,7 @@
 TEXT ·Index(SB),NOSPLIT,$0-56
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
-	MOVQ b_base+24(FP), BP
+	MOVQ b_base+24(FP), R8
 	MOVQ b_len+32(FP), AX
 	MOVQ DI, R10
 	LEAQ ret+48(FP), R11
@@ -17,7 +17,7 @@ TEXT ·Index(SB),NOSPLIT,$0-56
 TEXT ·IndexString(SB),NOSPLIT,$0-40
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
-	MOVQ b_base+16(FP), BP
+	MOVQ b_base+16(FP), R8
 	MOVQ b_len+24(FP), AX
 	MOVQ DI, R10
 	LEAQ ret+32(FP), R11
@@ -26,7 +26,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40
 // AX: length of string, that we are searching for
 // DX: length of string, in which we are searching
 // DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
+// R8: pointer to string, that we are searching for
 // R11: address, where to put return value
 // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
 TEXT indexbody<>(SB),NOSPLIT,$0
@@ -37,11 +37,11 @@ TEXT indexbody<>(SB),NOSPLIT,$0
 no_sse42:
 	CMPQ AX, $2
 	JA   _3_or_more
-	MOVW (BP), BP
+	MOVW (R8), R8
 	LEAQ -1(DI)(DX*1), DX
 loop2:
 	MOVW (DI), SI
-	CMPW SI,BP
+	CMPW SI,R8
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -50,12 +50,12 @@ loop2:
 _3_or_more:
 	CMPQ AX, $3
 	JA   _4_or_more
-	MOVW 1(BP), BX
-	MOVW (BP), BP
+	MOVW 1(R8), BX
+	MOVW (R8), R8
 	LEAQ -2(DI)(DX*1), DX
 loop3:
 	MOVW (DI), SI
-	CMPW SI,BP
+	CMPW SI,R8
 	JZ   partial_success3
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -72,11 +72,11 @@ partial_success3:
 _4_or_more:
 	CMPQ AX, $4
 	JA   _5_or_more
-	MOVL (BP), BP
+	MOVL (R8), R8
 	LEAQ -3(DI)(DX*1), DX
 loop4:
 	MOVL (DI), SI
-	CMPL SI,BP
+	CMPL SI,R8
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -87,11 +87,11 @@ _5_or_more:
 	JA   _8_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVL -4(BP)(AX*1), BX
-	MOVL (BP), BP
+	MOVL -4(R8)(AX*1), BX
+	MOVL (R8), R8
 loop5to7:
 	MOVL (DI), SI
-	CMPL SI,BP
+	CMPL SI,R8
 	JZ   partial_success5to7
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -108,11 +108,11 @@ partial_success5to7:
 _8_or_more:
 	CMPQ AX, $8
 	JA   _9_or_more
-	MOVQ (BP), BP
+	MOVQ (R8), R8
 	LEAQ -7(DI)(DX*1), DX
 loop8:
 	MOVQ (DI), SI
-	CMPQ SI,BP
+	CMPQ SI,R8
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -123,11 +123,11 @@ _9_or_more:
 	JA   _16_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVQ -8(BP)(AX*1), BX
-	MOVQ (BP), BP
+	MOVQ -8(R8)(AX*1), BX
+	MOVQ (R8), R8
 loop9to15:
 	MOVQ (DI), SI
-	CMPQ SI,BP
+	CMPQ SI,R8
 	JZ   partial_success9to15
 	ADDQ $1,DI
 	CMPQ DI,DX
@@ -144,7 +144,7 @@ partial_success9to15:
 _16_or_more:
 	CMPQ AX, $16
 	JA   _17_or_more
-	MOVOU (BP), X1
+	MOVOU (R8), X1
 	LEAQ -15(DI)(DX*1), DX
 loop16:
 	MOVOU (DI), X2
@@ -161,8 +161,8 @@ _17_or_more:
 	JA   _32_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	MOVOU -16(BP)(AX*1), X0
-	MOVOU (BP), X1
+	MOVOU -16(R8)(AX*1), X0
+	MOVOU (R8), X1
 loop17to31:
 	MOVOU (DI), X2
 	PCMPEQB X1,X2
@@ -188,7 +188,7 @@ partial_success17to31:
 _32_or_more:
 	CMPQ AX, $32
 	JA   _33_to_63
-	VMOVDQU (BP), Y1
+	VMOVDQU (R8), Y1
 	LEAQ -31(DI)(DX*1), DX
 loop32:
 	VMOVDQU (DI), Y2
@@ -203,8 +203,8 @@ loop32:
 _33_to_63:
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
-	VMOVDQU -32(BP)(AX*1), Y0
-	VMOVDQU (BP), Y1
+	VMOVDQU -32(R8)(AX*1), Y0
+	VMOVDQU (R8), Y1
 loop33to63:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
@@ -241,10 +241,10 @@ sse42:
 	// This value was determined experimentally and is the ~same
 	// on Nehalem (first with SSE42) and Haswell.
 	JAE _9_or_more
-	LEAQ 16(BP), SI
+	LEAQ 16(R8), SI
 	TESTW $0xff0, SI
 	JEQ no_sse42
-	MOVOU (BP), X1
+	MOVOU (R8), X1
 	LEAQ -15(DI)(DX*1), SI
 	MOVQ $16, R9
 	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index b60057ce83..621c01b365 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -212,7 +212,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12
 	// due to stack probes inserted to avoid stack/heap collisions.
 	// See issue #20427.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -250,7 +250,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
@@ -277,7 +277,7 @@ fallback:
 TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -315,7 +315,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
-- 
cgit v1.3


From 0941fc3f9ff43598d25fa6e964e7829a268102bf Mon Sep 17 00:00:00 2001
From: cui <cuiweixie@gmail.com>
Date: Wed, 12 Aug 2020 17:33:41 +0000
Subject: runtime: reduce syscalls when calling runtime.clone

Change-Id: I3ea398fd86aae4c86557dd6fff65d90a6f756890
GitHub-Last-Rev: 4c295388f7b5e6768ffd2530337f78b4c75a9310
GitHub-Pull-Request: golang/go#40392
Reviewed-on: https://go-review.googlesource.com/c/go/+/244626
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
---
 src/runtime/sys_linux_amd64.s | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'src/runtime/sys_linux_amd64.s')

diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index 621c01b365..8d90813589 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -592,13 +592,25 @@ TEXT runtime·clone(SB),NOSPLIT,$0
 	MOVQ	stk+8(FP), SI
 	MOVQ	$0, DX
 	MOVQ	$0, R10
-
+	MOVQ    $0, R8
 	// Copy mp, gp, fn off parent stack for use by child.
 	// Careful: Linux system call clobbers CX and R11.
-	MOVQ	mp+16(FP), R8
+	MOVQ	mp+16(FP), R13
 	MOVQ	gp+24(FP), R9
 	MOVQ	fn+32(FP), R12
-
+	CMPQ	R13, $0    // m
+	JEQ	nog1
+	CMPQ	R9, $0    // g
+	JEQ	nog1
+	LEAQ	m_tls(R13), R8
+#ifdef GOOS_android
+	// Android stores the TLS offset in runtime·tls_g.
+	SUBQ	runtime·tls_g(SB), R8
+#else
+	ADDQ	$8, R8	// ELF wants to use -8(FS)
+#endif
+	ORQ 	$0x00080000, DI //add flag CLONE_SETTLS(0x00080000) to call clone
+nog1:
 	MOVL	$SYS_clone, AX
 	SYSCALL
 
@@ -612,27 +624,23 @@ TEXT runtime·clone(SB),NOSPLIT,$0
 	MOVQ	SI, SP
 
 	// If g or m are nil, skip Go-related setup.
-	CMPQ	R8, $0    // m
-	JEQ	nog
+	CMPQ	R13, $0    // m
+	JEQ	nog2
 	CMPQ	R9, $0    // g
-	JEQ	nog
+	JEQ	nog2
 
 	// Initialize m->procid to Linux tid
 	MOVL	$SYS_gettid, AX
 	SYSCALL
-	MOVQ	AX, m_procid(R8)
-
-	// Set FS to point at m->tls.
-	LEAQ	m_tls(R8), DI
-	CALL	runtime·settls(SB)
+	MOVQ	AX, m_procid(R13)
 
 	// In child, set up new stack
 	get_tls(CX)
-	MOVQ	R8, g_m(R9)
+	MOVQ	R13, g_m(R9)
 	MOVQ	R9, g(CX)
 	CALL	runtime·stackcheck(SB)
 
-nog:
+nog2:
 	// Call fn
 	CALL	R12
 
-- 
cgit v1.3