From eeca3ba92fdb07e44abf3e2bebfcede03e1eae12 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Thu, 28 Apr 2016 07:16:08 -0500
Subject: sync/atomic, runtime/internal/atomic: improve ppc64x atomics

The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:

- For those cases containing a lwarx and stwcx (or other sizes):
      sync, lwarx, maybe something, stwcx, loop to sync, sync, isync
  The sync is moved before (outside) the lwarx/stwcx loop, and the
  sync after is removed, so it becomes:
      sync, lwarx, maybe something, stwcx, loop to lwarx, isync
- For the Or8 and And8, the shifting and manipulation of the
  address to the word aligned version were removed and the
  instructions were changed to use lbarx, stbcx instead of
  register shifting, xor, then lwarx, stwcx.
- New instructions LWSYNC, LBAR, STBCC were tested and added.
  runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
  instead of the WORD encoding.

Fixes #15469

Ran some of the benchmarks in the runtime and sync directories.
Some results varied from run to run but the trend was improvement
based on best times for base and new:

runtime.test:
BenchmarkChanNonblocking-128            0.88      0.89      +1.14%
BenchmarkChanUncontended-128            569       511       -10.19%
BenchmarkChanContended-128              63110     53231     -15.65%
BenchmarkChanSync-128                   691       598       -13.46%
BenchmarkChanSyncWork-128               11355     11649     +2.59%
BenchmarkChanProdCons0-128              2402      2090      -12.99%
BenchmarkChanProdCons10-128             1348      1363      +1.11%
BenchmarkChanProdCons100-128            1002      746       -25.55%
BenchmarkChanProdConsWork0-128          2554      2720      +6.50%
BenchmarkChanProdConsWork10-128         1909      1804      -5.50%
BenchmarkChanProdConsWork100-128        1624      1580      -2.71%
BenchmarkChanCreation-128               237       212       -10.55%
BenchmarkChanSem-128                    705       667       -5.39%
BenchmarkChanPopular-128                5081190   4497566   -11.49%
BenchmarkCreateGoroutines-128           532       473       -11.09%
BenchmarkCreateGoroutinesParallel-128   35.0      34.7      -0.86%
BenchmarkCreateGoroutinesCapture-128    4923      4200      -14.69%

sync.test:
BenchmarkUncontendedSemaphore-128       112       94.2      -15.89%
BenchmarkContendedSemaphore-128         133       128       -3.76%
BenchmarkMutexUncontended-128           1.90      1.67      -12.11%
BenchmarkMutex-128                      353       310       -12.18%
BenchmarkMutexSlack-128                 304       283       -6.91%
BenchmarkMutexWork-128                  554       541       -2.35%
BenchmarkMutexWorkSlack-128             567       556       -1.94%
BenchmarkMutexNoSpin-128                275       242       -12.00%
BenchmarkMutexSpin-128                  1129      1030      -8.77%
BenchmarkOnce-128                       1.08      0.96      -11.11%
BenchmarkPool-128                       29.8      27.4      -8.05%
BenchmarkPoolOverflow-128               40564     36583     -9.81%
BenchmarkSemaUncontended-128            3.14      2.63      -16.24%
BenchmarkSemaSyntNonblock-128           1087      1069      -1.66%
BenchmarkSemaSyntBlock-128              897       893       -0.45%
BenchmarkSemaWorkNonblock-128           1034      1028      -0.58%
BenchmarkSemaWorkBlock-128              949       886       -6.64%

Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday
Reviewed-by: Minux Ma
---
 src/sync/atomic/asm_ppc64x.s | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'src/sync')

diff --git a/src/sync/atomic/asm_ppc64x.s b/src/sync/atomic/asm_ppc64x.s
index ed348458b4..2474e96435 100644
--- a/src/sync/atomic/asm_ppc64x.s
+++ b/src/sync/atomic/asm_ppc64x.s
@@ -15,8 +15,7 @@ TEXT ·SwapUint32(SB),NOSPLIT,$0-20
 	SYNC
 	LWAR	(R3), R5
 	STWCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVW	R5, old+16(FP)
 	RET
@@ -30,8 +29,7 @@ TEXT ·SwapUint64(SB),NOSPLIT,$0-24
 	SYNC
 	LDAR	(R3), R5
 	STDCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVD	R5, old+16(FP)
 	RET
@@ -49,10 +47,9 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
 	SYNC
 	LWAR	(R3), R6
 	CMPW	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STWCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+16(FP)
@@ -73,10 +70,9 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
 	SYNC
 	LDAR	(R3), R6
 	CMP	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STDCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+24(FP)
@@ -94,8 +90,7 @@ TEXT ·AddUint32(SB),NOSPLIT,$0-20
 	LWAR	(R3), R5
 	ADD	R4, R5
 	STWCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVW	R5, ret+16(FP)
 	RET
@@ -113,8 +108,7 @@ TEXT ·AddUint64(SB),NOSPLIT,$0-24
 	LDAR	(R3), R5
 	ADD	R4, R5
 	STDCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVD	R5, ret+16(FP)
 	RET
--
cgit v1.3