From eeca3ba92fdb07e44abf3e2bebfcede03e1eae12 Mon Sep 17 00:00:00 2001
From: Lynn Boger <laboger@linux.vnet.ibm.com>
Date: Thu, 28 Apr 2016 07:16:08 -0500
Subject: sync/atomic, runtime/internal/atomic: improve ppc64x atomics

The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:

- For those cases containing a lwarx and stwcx (or other sizes):
sync, lwarx, maybe something, stwcx, loop to sync, sync, isync
The sync is moved before (outside) the lwarx/stwcx loop, and the
 sync after is removed, so it becomes:
sync, lwarx, maybe something, stwcx, loop to lwarx, isync

- For the Or8 and And8, the shifting and manipulation of the
address to the word aligned version were removed and the
instructions were changed to use lbarx, stbcx instead of
register shifting, xor, then lwarx, stwcx.

- New instructions LWSYNC, LBAR, STBCC were tested and added.
runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
instead of the WORD encoding.

Fixes #15469

Ran some of the benchmarks in the runtime and sync directories.
Some results varied from run to run but the trend was improvement
based on best times for base and new:

runtime.test:
BenchmarkChanNonblocking-128         0.88          0.89          +1.14%
BenchmarkChanUncontended-128         569           511           -10.19%
BenchmarkChanContended-128           63110         53231         -15.65%
BenchmarkChanSync-128                691           598           -13.46%
BenchmarkChanSyncWork-128            11355         11649         +2.59%
BenchmarkChanProdCons0-128           2402          2090          -12.99%
BenchmarkChanProdCons10-128          1348          1363          +1.11%
BenchmarkChanProdCons100-128         1002          746           -25.55%
BenchmarkChanProdConsWork0-128       2554          2720          +6.50%
BenchmarkChanProdConsWork10-128      1909          1804          -5.50%
BenchmarkChanProdConsWork100-128     1624          1580          -2.71%
BenchmarkChanCreation-128            237           212           -10.55%
BenchmarkChanSem-128                 705           667           -5.39%
BenchmarkChanPopular-128             5081190       4497566       -11.49%

BenchmarkCreateGoroutines-128             532           473           -11.09%
BenchmarkCreateGoroutinesParallel-128     35.0          34.7          -0.86%
BenchmarkCreateGoroutinesCapture-128      4923          4200          -14.69%

sync.test:
BenchmarkUncontendedSemaphore-128      112           94.2          -15.89%
BenchmarkContendedSemaphore-128        133           128           -3.76%
BenchmarkMutexUncontended-128          1.90          1.67          -12.11%
BenchmarkMutex-128                     353           310           -12.18%
BenchmarkMutexSlack-128                304           283           -6.91%
BenchmarkMutexWork-128                 554           541           -2.35%
BenchmarkMutexWorkSlack-128            567           556           -1.94%
BenchmarkMutexNoSpin-128               275           242           -12.00%
BenchmarkMutexSpin-128                 1129          1030          -8.77%
BenchmarkOnce-128                      1.08          0.96          -11.11%
BenchmarkPool-128                      29.8          27.4          -8.05%
BenchmarkPoolOverflow-128              40564         36583         -9.81%
BenchmarkSemaUncontended-128           3.14          2.63          -16.24%
BenchmarkSemaSyntNonblock-128          1087          1069          -1.66%
BenchmarkSemaSyntBlock-128             897           893           -0.45%
BenchmarkSemaWorkNonblock-128          1034          1028          -0.58%
BenchmarkSemaWorkBlock-128             949           886           -6.64%

Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Minux Ma <minux@golang.org>
---
 src/sync/atomic/asm_ppc64x.s | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'src/sync')

diff --git a/src/sync/atomic/asm_ppc64x.s b/src/sync/atomic/asm_ppc64x.s
index ed348458b4..2474e96435 100644
--- a/src/sync/atomic/asm_ppc64x.s
+++ b/src/sync/atomic/asm_ppc64x.s
@@ -15,8 +15,7 @@ TEXT ·SwapUint32(SB),NOSPLIT,$0-20
 	SYNC
 	LWAR	(R3), R5
 	STWCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVW	R5, old+16(FP)
 	RET
@@ -30,8 +29,7 @@ TEXT ·SwapUint64(SB),NOSPLIT,$0-24
 	SYNC
 	LDAR	(R3), R5
 	STDCCC	R4, (R3)
-	BNE	-3(PC)
-	SYNC
+	BNE	-2(PC)
 	ISYNC
 	MOVD	R5, old+16(FP)
 	RET
@@ -49,10 +47,9 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
 	SYNC
 	LWAR	(R3), R6
 	CMPW	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STWCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+16(FP)
@@ -73,10 +70,9 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
 	SYNC
 	LDAR	(R3), R6
 	CMP	R6, R4
-	BNE	8(PC)
+	BNE	7(PC)
 	STDCCC	R5, (R3)
-	BNE	-5(PC)
-	SYNC
+	BNE	-4(PC)
 	ISYNC
 	MOVD	$1, R3
 	MOVB	R3, swapped+24(FP)
@@ -94,8 +90,7 @@ TEXT ·AddUint32(SB),NOSPLIT,$0-20
 	LWAR	(R3), R5
 	ADD	R4, R5
 	STWCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVW	R5, ret+16(FP)
 	RET
@@ -113,8 +108,7 @@ TEXT ·AddUint64(SB),NOSPLIT,$0-24
 	LDAR	(R3), R5
 	ADD	R4, R5
 	STDCCC	R5, (R3)
-	BNE	-4(PC)
-	SYNC
+	BNE	-3(PC)
 	ISYNC
 	MOVD	R5, ret+16(FP)
 	RET
-- 
cgit v1.3