Skip to content

Commit d5fd2dd

Browse files
committed
sync: use lock-free structure for Pool stealing
Currently, Pool stores each per-P shard's overflow in a slice protected by a Mutex. In order to store to the overflow or steal from another shard, a P must lock that shard's Mutex. This allows for simple synchronization between Put and Get, but has unfortunate consequences for clearing pools. Pools are cleared during STW sweep termination, and hence rely on pinning a goroutine to its P to synchronize between Get/Put and clearing. This makes the Get/Put fast path extremely fast because it can rely on quiescence-style coordination, which doesn't even require atomic writes, much less locking. The catch is that a goroutine cannot acquire a Mutex while pinned to its P (as this could deadlock). Hence, it must drop the pin on the slow path. But this means the slow path is not synchronized with clearing. As a result: (1) It's difficult to reason about races between clearing and the slow path. Furthermore, this reasoning often depends on unspecified nuances of where preemption points can occur. (2) Clearing must zero out the pointer to every object in every Pool to prevent a concurrent slow path from causing all objects to be retained. Since this happens during STW, this has an O(# objects in Pools) effect on STW time. (3) We can't implement a victim cache without making clearing even slower. This CL solves these problems by replacing the locked overflow slice with a lock-free structure. This allows Gets and Puts to be pinned the whole time they're manipulating the shards slice (Pool.local), which eliminates the races between Get/Put and clearing. This, in turn, eliminates the need to zero all object pointers, reducing clearing to O(# of Pools) during STW. In addition to significantly reducing STW impact, this also happens to speed up the Get/Put fast-path and the slow path. It somewhat increases the cost of PoolExpensiveNew, but we'll fix that in the next CL. 
name old time/op new time/op delta Pool-12 3.00ns ± 0% 2.21ns ±36% -26.32% (p=0.000 n=18+19) PoolOverflow-12 600ns ± 1% 587ns ± 1% -2.21% (p=0.000 n=16+18) PoolSTW-12 71.0µs ± 2% 5.6µs ± 3% -92.15% (p=0.000 n=20+20) PoolExpensiveNew-12 3.14ms ± 5% 3.69ms ± 7% +17.67% (p=0.000 n=19+20) name old p50-ns/STW new p50-ns/STW delta PoolSTW-12 70.7k ± 1% 5.5k ± 2% -92.25% (p=0.000 n=20+20) name old p95-ns/STW new p95-ns/STW delta PoolSTW-12 73.1k ± 2% 6.7k ± 4% -90.86% (p=0.000 n=18+19) name old GCs/op new GCs/op delta PoolExpensiveNew-12 0.38 ± 1% 0.39 ± 1% +2.07% (p=0.000 n=20+18) name old New/op new New/op delta PoolExpensiveNew-12 33.9 ± 6% 40.0 ± 6% +17.97% (p=0.000 n=19+20) (https://perf.golang.org/search?q=upload:20190311.1) Fixes #22331. For #22950. Change-Id: Ic5cd826e25e218f3f8256dbc4d22835c1fecb391 Reviewed-on: https://go-review.googlesource.com/c/go/+/166960 Run-TryBot: Austin Clements <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: David Chase <[email protected]>
1 parent 59f2704 commit d5fd2dd

File tree

1 file changed

+26
-48
lines changed

1 file changed

+26
-48
lines changed

src/sync/pool.go

+26-48
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,8 @@ type Pool struct {
5555

5656
// Local per-P Pool appendix.
5757
type poolLocalInternal struct {
58-
private interface{} // Can be used only by the respective P.
59-
shared []interface{} // Can be used by any P.
60-
Mutex // Protects shared.
58+
private interface{} // Can be used only by the respective P.
59+
shared poolChain // Local P can pushHead/popHead; any P can popTail.
6160
}
6261

6362
type poolLocal struct {
@@ -97,17 +96,15 @@ func (p *Pool) Put(x interface{}) {
9796
race.ReleaseMerge(poolRaceAddr(x))
9897
race.Disable()
9998
}
100-
l := p.pin()
99+
l, _ := p.pin()
101100
if l.private == nil {
102101
l.private = x
103102
x = nil
104103
}
105-
runtime_procUnpin()
106104
if x != nil {
107-
l.Lock()
108-
l.shared = append(l.shared, x)
109-
l.Unlock()
105+
l.shared.pushHead(x)
110106
}
107+
runtime_procUnpin()
111108
if race.Enabled {
112109
race.Enable()
113110
}
@@ -125,22 +122,19 @@ func (p *Pool) Get() interface{} {
125122
if race.Enabled {
126123
race.Disable()
127124
}
128-
l := p.pin()
125+
l, pid := p.pin()
129126
x := l.private
130127
l.private = nil
131-
runtime_procUnpin()
132128
if x == nil {
133-
l.Lock()
134-
last := len(l.shared) - 1
135-
if last >= 0 {
136-
x = l.shared[last]
137-
l.shared = l.shared[:last]
138-
}
139-
l.Unlock()
129+
// Try to pop the head of the local shard. We prefer
130+
// the head over the tail for temporal locality of
131+
// reuse.
132+
x, _ = l.shared.popHead()
140133
if x == nil {
141-
x = p.getSlow()
134+
x = p.getSlow(pid)
142135
}
143136
}
137+
runtime_procUnpin()
144138
if race.Enabled {
145139
race.Enable()
146140
if x != nil {
@@ -153,31 +147,24 @@ func (p *Pool) Get() interface{} {
153147
return x
154148
}
155149

156-
func (p *Pool) getSlow() (x interface{}) {
150+
func (p *Pool) getSlow(pid int) interface{} {
157151
// See the comment in pin regarding ordering of the loads.
158152
size := atomic.LoadUintptr(&p.localSize) // load-acquire
159153
local := p.local // load-consume
160154
// Try to steal one element from other procs.
161-
pid := runtime_procPin()
162-
runtime_procUnpin()
163155
for i := 0; i < int(size); i++ {
164156
l := indexLocal(local, (pid+i+1)%int(size))
165-
l.Lock()
166-
last := len(l.shared) - 1
167-
if last >= 0 {
168-
x = l.shared[last]
169-
l.shared = l.shared[:last]
170-
l.Unlock()
171-
break
157+
if x, _ := l.shared.popTail(); x != nil {
158+
return x
172159
}
173-
l.Unlock()
174160
}
175-
return x
161+
return nil
176162
}
177163

178-
// pin pins the current goroutine to P, disables preemption and returns poolLocal pool for the P.
164+
// pin pins the current goroutine to P, disables preemption and
165+
// returns poolLocal pool for the P and the P's id.
179166
// Caller must call runtime_procUnpin() when done with the pool.
180-
func (p *Pool) pin() *poolLocal {
167+
func (p *Pool) pin() (*poolLocal, int) {
181168
pid := runtime_procPin()
182169
// In pinSlow we store to localSize and then to local, here we load in opposite order.
183170
// Since we've disabled preemption, GC cannot happen in between.
@@ -186,12 +173,12 @@ func (p *Pool) pin() *poolLocal {
186173
s := atomic.LoadUintptr(&p.localSize) // load-acquire
187174
l := p.local // load-consume
188175
if uintptr(pid) < s {
189-
return indexLocal(l, pid)
176+
return indexLocal(l, pid), pid
190177
}
191178
return p.pinSlow()
192179
}
193180

194-
func (p *Pool) pinSlow() *poolLocal {
181+
func (p *Pool) pinSlow() (*poolLocal, int) {
195182
// Retry under the mutex.
196183
// Can not lock the mutex while pinned.
197184
runtime_procUnpin()
@@ -202,7 +189,7 @@ func (p *Pool) pinSlow() *poolLocal {
202189
s := p.localSize
203190
l := p.local
204191
if uintptr(pid) < s {
205-
return indexLocal(l, pid)
192+
return indexLocal(l, pid), pid
206193
}
207194
if p.local == nil {
208195
allPools = append(allPools, p)
@@ -212,26 +199,17 @@ func (p *Pool) pinSlow() *poolLocal {
212199
local := make([]poolLocal, size)
213200
atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release
214201
atomic.StoreUintptr(&p.localSize, uintptr(size)) // store-release
215-
return &local[pid]
202+
return &local[pid], pid
216203
}
217204

218205
func poolCleanup() {
219206
// This function is called with the world stopped, at the beginning of a garbage collection.
220207
// It must not allocate and probably should not call any runtime functions.
221-
// Defensively zero out everything, 2 reasons:
222-
// 1. To prevent false retention of whole Pools.
223-
// 2. If GC happens while a goroutine works with l.shared in Put/Get,
224-
// it will retain whole Pool. So next cycle memory consumption would be doubled.
208+
209+
// Because the world is stopped, no pool user can be in a
210+
// pinned section (in effect, this has all Ps pinned).
225211
for i, p := range allPools {
226212
allPools[i] = nil
227-
for i := 0; i < int(p.localSize); i++ {
228-
l := indexLocal(p.local, i)
229-
l.private = nil
230-
for j := range l.shared {
231-
l.shared[j] = nil
232-
}
233-
l.shared = nil
234-
}
235213
p.local = nil
236214
p.localSize = 0
237215
}

0 commit comments

Comments
 (0)