aboutsummaryrefslogtreecommitdiff
path: root/src/pkg/runtime/proc.c
diff options
context:
space:
mode:
authorDmitriy Vyukov <dvyukov@google.com>2014-01-21 13:06:57 +0400
committerDmitriy Vyukov <dvyukov@google.com>2014-01-21 13:06:57 +0400
commitcb133c66073303b08e893d6b71faf98bda2402e9 (patch)
treea2c7929c7454d1cf754e383164ea12522884217c /src/pkg/runtime/proc.c
parent0e027fca428a893ceeb49e4f833f9a923c5f201d (diff)
downloadgo-cb133c66073303b08e893d6b71faf98bda2402e9.tar.xz
runtime: do not collect GC roots explicitly
Currently we collect (add) all roots into a global array in a single-threaded GC phase. This hinders parallelism. With this change we just kick off parallel for for number_of_goroutines+5 iterations. Then parallel for callback decides whether it needs to scan stack of a goroutine scan data segment, scan finalizers, etc. This eliminates the single-threaded phase entirely. This requires to store all goroutines in an array instead of a linked list (to allow direct indexing). This CL also removes DebugScan functionality. It is broken because it uses unbounded stack, so it can not run on g0. When it was working, I've found it helpless for debugging issues because the two algorithms are too different now. This change would require updating the DebugScan, so it's simpler to just delete it. With 8 threads this change reduces GC pause by ~6%, while keeping cputime roughly the same. garbage-8 allocated 2987886 2989221 +0.04% allocs 62885 62887 +0.00% cputime 21286000 21272000 -0.07% gc-pause-one 26633247 24885421 -6.56% gc-pause-total 873570 811264 -7.13% rss 242089984 242515968 +0.18% sys-gc 13934336 13869056 -0.47% sys-heap 205062144 205062144 +0.00% sys-other 12628288 12628288 +0.00% sys-stack 11534336 11927552 +3.41% sys-total 243159104 243487040 +0.13% time 2809477 2740795 -2.44% R=golang-codereviews, rsc CC=cshapiro, golang-codereviews, khr https://golang.org/cl/46860043
Diffstat (limited to 'src/pkg/runtime/proc.c')
-rw-r--r--src/pkg/runtime/proc.c77
1 files changed, 55 insertions, 22 deletions
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 47012ae550..d6732d2c61 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -67,8 +67,7 @@ int32 runtime·gomaxprocs;
uint32 runtime·needextram;
bool runtime·iscgo;
M runtime·m0;
-G runtime·g0; // idle goroutine for m0
-G* runtime·allg;
+G runtime·g0; // idle goroutine for m0
G* runtime·lastg;
M* runtime·allm;
M* runtime·extram;
@@ -76,6 +75,11 @@ int8* runtime·goos;
int32 runtime·ncpu;
static int32 newprocs;
+static Lock allglock; // the following vars are protected by this lock or by stoptheworld
+G** runtime·allg;
+uintptr runtime·allglen;
+static uintptr allgcap;
+
void runtime·mstart(void);
static void runqput(P*, G*);
static G* runqget(P*);
@@ -115,6 +119,7 @@ static bool preemptall(void);
static bool preemptone(P*);
static bool exitsyscallfast(void);
static bool haveexperiment(int8*);
+static void allgadd(G*);
// The bootstrap sequence is:
//
@@ -278,6 +283,7 @@ runtime·tracebackothers(G *me)
{
G *gp;
int32 traceback;
+ uintptr i;
traceback = runtime·gotraceback(nil);
@@ -288,7 +294,9 @@ runtime·tracebackothers(G *me)
runtime·traceback(~(uintptr)0, ~(uintptr)0, 0, gp);
}
- for(gp = runtime·allg; gp != nil; gp = gp->alllink) {
+ runtime·lock(&allglock);
+ for(i = 0; i < runtime·allglen; i++) {
+ gp = runtime·allg[i];
if(gp == me || gp == m->curg || gp->status == Gdead)
continue;
if(gp->issystem && traceback < 2)
@@ -301,6 +309,7 @@ runtime·tracebackothers(G *me)
} else
runtime·traceback(~(uintptr)0, ~(uintptr)0, 0, gp);
}
+ runtime·unlock(&allglock);
}
static void
@@ -792,13 +801,7 @@ runtime·newextram(void)
if(raceenabled)
gp->racectx = runtime·racegostart(runtime·newextram);
// put on allg for garbage collector
- runtime·lock(&runtime·sched);
- if(runtime·lastg == nil)
- runtime·allg = gp;
- else
- runtime·lastg->alllink = gp;
- runtime·lastg = gp;
- runtime·unlock(&runtime·sched);
+ allgadd(gp);
// Add m to the extra list.
mnext = lockextra(true);
@@ -1766,13 +1769,7 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp
runtime·throw("invalid stack in newg");
} else {
newg = runtime·malg(StackMin);
- runtime·lock(&runtime·sched);
- if(runtime·lastg == nil)
- runtime·allg = newg;
- else
- runtime·lastg->alllink = newg;
- runtime·lastg = newg;
- runtime·unlock(&runtime·sched);
+ allgadd(newg);
}
sp = (byte*)newg->stackbase;
@@ -1805,6 +1802,31 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp
return newg;
}
+static void
+allgadd(G *gp)
+{
+ G **new;
+ uintptr cap;
+
+ runtime·lock(&allglock);
+ if(runtime·allglen >= allgcap) {
+ cap = 4096/sizeof(new[0]);
+ if(cap < 2*allgcap)
+ cap = 2*allgcap;
+ new = runtime·malloc(cap*sizeof(new[0]));
+ if(new == nil)
+ runtime·throw("runtime: cannot allocate memory");
+ if(runtime·allg != nil) {
+ runtime·memmove(new, runtime·allg, runtime·allglen*sizeof(new[0]));
+ runtime·free(runtime·allg);
+ }
+ runtime·allg = new;
+ allgcap = cap;
+ }
+ runtime·allg[runtime·allglen++] = gp;
+ runtime·unlock(&allglock);
+}
+
// Put on gfree list.
// If local list is too long, transfer a batch to the global list.
static void
@@ -1994,19 +2016,21 @@ runtime·gcount(void)
{
G *gp;
int32 n, s;
+ uintptr i;
n = 0;
- runtime·lock(&runtime·sched);
+ runtime·lock(&allglock);
// TODO(dvyukov): runtime.NumGoroutine() is O(N).
// We do not want to increment/decrement centralized counter in newproc/goexit,
// just to make runtime.NumGoroutine() faster.
// Compromise solution is to introduce per-P counters of active goroutines.
- for(gp = runtime·allg; gp; gp = gp->alllink) {
+ for(i = 0; i < runtime·allglen; i++) {
+ gp = runtime·allg[i];
s = gp->status;
if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
n++;
}
- runtime·unlock(&runtime·sched);
+ runtime·unlock(&allglock);
return n;
}
@@ -2345,6 +2369,7 @@ checkdead(void)
{
G *gp;
int32 run, grunning, s;
+ uintptr i;
// -1 for sysmon
run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.nmidlelocked - 1;
@@ -2356,17 +2381,21 @@ checkdead(void)
runtime·throw("checkdead: inconsistent counts");
}
grunning = 0;
- for(gp = runtime·allg; gp; gp = gp->alllink) {
+ runtime·lock(&allglock);
+ for(i = 0; i < runtime·allglen; i++) {
+ gp = runtime·allg[i];
if(gp->isbackground)
continue;
s = gp->status;
if(s == Gwaiting)
grunning++;
else if(s == Grunnable || s == Grunning || s == Gsyscall) {
+ runtime·unlock(&allglock);
runtime·printf("checkdead: find g %D in status %d\n", gp->goid, s);
runtime·throw("checkdead: runnable g");
}
}
+ runtime·unlock(&allglock);
if(grunning == 0) // possible if main goroutine calls runtime·Goexit()
runtime·exit(0);
m->throwing = -1; // do not dump full stacks
@@ -2553,6 +2582,7 @@ runtime·schedtrace(bool detailed)
int64 now;
int64 id1, id2, id3;
int32 i, t, h;
+ uintptr gi;
int8 *fmt;
M *mp, *lockedm;
G *gp, *lockedg;
@@ -2620,13 +2650,16 @@ runtime·schedtrace(bool detailed)
mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
mp->spinning, id3);
}
- for(gp = runtime·allg; gp; gp = gp->alllink) {
+ runtime·lock(&allglock);
+ for(gi = 0; gi < runtime·allglen; gi++) {
+ gp = runtime·allg[gi];
mp = gp->m;
lockedm = gp->lockedm;
runtime·printf(" G%D: status=%d(%s) m=%d lockedm=%d\n",
gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
lockedm ? lockedm->id : -1);
}
+ runtime·unlock(&allglock);
runtime·unlock(&runtime·sched);
}