aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/sys_linux_386.s
diff options
context:
space:
mode:
authorFrank Somers <fsomers@arista.com>2017-10-10 22:50:01 +0100
committerIan Lance Taylor <iant@golang.org>2017-10-13 14:41:04 +0000
commitaf40cbe83c451176a1576ff5ce5755c3dc119f45 (patch)
tree1bc4e0b90fef19f0ffe18e2b2e862a2fc9016370 /src/runtime/sys_linux_386.s
parentbf237f534cc2a5ed23ae7ffa7deb38cbee64cae6 (diff)
downloadgo-af40cbe83c451176a1576ff5ce5755c3dc119f45.tar.xz
runtime: use vDSO on linux/386 to improve time.Now performance
This change adds support for accelerating time.Now by using the __vdso_clock_gettime fast-path via the vDSO on linux/386 if it is available. When the vDSO path to the clocks is available, it is typically 5x-10x faster than the syscall path (see benchmark extract below). Two such calls are made for each time.Now() call on most platforms as of go 1.9. - Add vdso_linux_386.go, containing the ELF32 definitions for use by vdso_linux.go, the maximum array size, and the symbols to be located in the vDSO. - Modify runtime.walltime and runtime.nanotime to check for and use the vDSO fast-path if available, or fall back to the existing syscall path. - Reduce the stack reservations for runtime.walltime and runtime.monotime from 32 to 16 bytes. It appears the syscall path actually only needed 8 bytes, but 16 is now needed to cover the syscall and vDSO paths. - Remove clearing DX from the syscall paths as clock_gettime only takes 2 args (BX, CX in syscall calling convention), so there should be no need to clear DX. The included BenchmarkTimeNow was run with -cpu=1 -count=20 on an "Intel(R) Celeron(R) CPU J1900 @ 1.99GHz", comparing released go 1.9.1 vs this change. This shows a gain in performance on linux/386 (6.89x), and that no regression occurred on linux/amd64 due to this change. Kernel: linux/i686, GOOS=linux GOARCH=386 name old time/op new time/op delta TimeNow 978ns ± 0% 142ns ± 0% -85.48% (p=0.000 n=16+20) Kernel: linux/x86_64, GOOS=linux GOARCH=amd64 name old time/op new time/op delta TimeNow 125ns ± 0% 125ns ± 0% ~ (all equal) Gains are more dramatic in virtualized environments, presumably due to the overhead of virtualizing the syscall. Fixes #22190 Change-Id: I2f83ce60cb1b8b310c9ced0706bb463c1b3aedf8 Reviewed-on: https://go-review.googlesource.com/69390 Run-TryBot: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>
Diffstat (limited to 'src/runtime/sys_linux_386.s')
-rw-r--r--src/runtime/sys_linux_386.s43
1 files changed, 39 insertions, 4 deletions
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 79985070f1..722d2ab2d3 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -203,12 +203,34 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-16
RET
// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+TEXT runtime·walltime(SB), NOSPLIT, $16
+ // Stack layout, depending on call path:
+ // x(SP) vDSO INVOKE_SYSCALL
+ // 12 ts.tv_nsec ts.tv_nsec
+ // 8 ts.tv_sec ts.tv_sec
+ // 4 &ts -
+ // 0 CLOCK_<id> -
+ //
+ // If we take the vDSO path, we're calling a function with gcc calling convention.
+ // We're guaranteed 128 bytes on entry. We've taken 16, and the call uses another 4,
+ // leaving 108 for __vdso_clock_gettime to use.
+ MOVL runtime·__vdso_clock_gettime_sym(SB), AX
+ CMPL AX, $0
+ JEQ fallback
+
+ LEAL 8(SP), BX // &ts (struct timespec)
+ MOVL BX, 4(SP)
+ MOVL $0, 0(SP) // CLOCK_REALTIME
+ CALL AX
+ JMP finish
+
+fallback:
MOVL $SYS_clock_gettime, AX
MOVL $0, BX // CLOCK_REALTIME
LEAL 8(SP), CX
- MOVL $0, DX
INVOKE_SYSCALL
+
+finish:
MOVL 8(SP), AX // sec
MOVL 12(SP), BX // nsec
@@ -220,12 +242,25 @@ TEXT runtime·walltime(SB), NOSPLIT, $32
// int64 nanotime(void) so really
// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB), NOSPLIT, $32
+TEXT runtime·nanotime(SB), NOSPLIT, $16
+ // See comments above in walltime() about stack space usage and layout.
+ MOVL runtime·__vdso_clock_gettime_sym(SB), AX
+ CMPL AX, $0
+ JEQ fallback
+
+ LEAL 8(SP), BX // &ts (struct timespec)
+ MOVL BX, 4(SP)
+ MOVL $1, 0(SP) // CLOCK_MONOTONIC
+ CALL AX
+ JMP finish
+
+fallback:
MOVL $SYS_clock_gettime, AX
MOVL $1, BX // CLOCK_MONOTONIC
LEAL 8(SP), CX
- MOVL $0, DX
INVOKE_SYSCALL
+
+finish:
MOVL 8(SP), AX // sec
MOVL 12(SP), BX // nsec