aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_amd64.s
diff options
context:
space:
mode:
authordoujiang24 <doujiang24@gmail.com>2023-03-24 01:30:46 +0000
committerCherry Mui <cherryyz@google.com>2023-03-24 16:00:24 +0000
commitef0dedce87b505f6115dd39b4329da9e9231ff95 (patch)
tree790632c8421af9f34fd84b70b2802ffb964a0533 /src/runtime/asm_amd64.s
parenta6c382eaa8eaa611d71232aa4d5391b56a5c2693 (diff)
downloadgo-ef0dedce87b505f6115dd39b4329da9e9231ff95.tar.xz
runtime/cgo: store M for C-created thread in pthread key
In a C thread, it's necessary to acquire an extra M by using needm while invoking a Go function from C. But, needm and dropm are heavy costs due to the signal-related syscalls. So, we change to not dropm while returning back to C, which means binding the extra M to the C thread until it exits, to avoid needm and dropm on each C to Go call. Instead, we only dropm while the C thread exits, so the extra M won't leak. When invoking a Go function from C: Allocate a pthread variable using pthread_key_create, only once per shared object, and register a thread-exit-time destructor. And store the g0 of the current m into the thread-specified value of the pthread key, only once per C thread, so that the destructor will put the extra M back onto the extra M list while the C thread exits. When returning back to C: Skip dropm in cgocallback, when the pthread variable has been created, so that the extra M will be reused the next time invoke a Go function from C. This is purely a performance optimization. The old version, in which needm & dropm happen on each cgo call, is still correct too, and we have to keep the old version on systems with cgo but without pthreads, like Windows. This optimization is significant, and the specific value depends on the OS system and CPU, but in general, it can be considered as 10x faster, for a simple Go function call from a C thread. For the newly added BenchmarkCGoInCThread, some benchmark results: 1. it's 28x faster, from 3395 ns/op to 121 ns/op, in darwin OS & Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz 2. it's 6.5x faster, from 1495 ns/op to 230 ns/op, in Linux OS & Intel(R) Xeon(R) CPU E5-2630 0 @ 2.30GHz Fixes #51676 Change-Id: I380702fe2f9b6b401b2d6f04b0aba990f4b9ee6c GitHub-Last-Rev: 93dc64ad98e5583372e41f65ee4b7ab78b5aff51 GitHub-Pull-Request: golang/go#51679 Reviewed-on: https://go-review.googlesource.com/c/go/+/392854 Reviewed-by: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: thepudds <thepudds1460@gmail.com> Reviewed-by: Cherry Mui <cherryyz@google.com>
Diffstat (limited to 'src/runtime/asm_amd64.s')
-rw-r--r--src/runtime/asm_amd64.s38
1 files changed, 33 insertions, 5 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 5e89c8d2da..c8641cb2c2 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -915,7 +915,20 @@ GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize
TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
- // If g is nil, Go did not create the current thread.
+ // Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
+ // It is used to dropm while thread is exiting.
+ MOVQ fn+0(FP), AX
+ CMPQ AX, $0
+ JNE loadg
+ // Restore the g from frame.
+ get_tls(CX)
+ MOVQ frame+8(FP), BX
+ MOVQ BX, g(CX)
+ JMP dropm
+
+loadg:
+ // If g is nil, Go did not create the current thread,
+ // or if this thread never called into Go on pthread platforms.
// Call needm to obtain one m for temporary use.
// In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from
@@ -953,9 +966,9 @@ needm:
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
XORQ R14, R14
- MOVQ $runtime·needm<ABIInternal>(SB), AX
+ MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
- MOVQ $0, savedm-8(SP) // dropm on return
+ MOVQ $0, savedm-8(SP)
get_tls(CX)
MOVQ g(CX), BX
MOVQ g_m(BX), BX
@@ -1044,11 +1057,26 @@ havem:
MOVQ 0(SP), AX
MOVQ AX, (g_sched+gobuf_sp)(SI)
- // If the m on entry was nil, we called needm above to borrow an m
- // for the duration of the call. Since the call is over, return it with dropm.
+ // If the m on entry was nil, we called needm above to borrow an m,
+ // 1. for the duration of the call on non-pthread platforms,
+ // 2. or the duration of the C thread alive on pthread platforms.
+ // If the m on entry wasn't nil,
+ // 1. the thread might be a Go thread,
+ // 2. or it's wasn't the first call from a C thread on pthread platforms,
+ // since the we skip dropm to resue the m in the first call.
MOVQ savedm-8(SP), BX
CMPQ BX, $0
JNE done
+
+ // Skip dropm to reuse it in the next call, when a pthread key has been created.
+ MOVQ _cgo_pthread_key_created(SB), AX
+ // It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
+ CMPQ AX, $0
+ JEQ dropm
+ CMPQ (AX), $0
+ JNE done
+
+dropm:
MOVQ $runtime·dropm(SB), AX
CALL AX
#ifdef GOOS_windows