Skip to content

Commit b2efd1d

Browse files
committed
runtime: put ReadMemStats debug assertions behind a double-check mode
ReadMemStats has a few assertions it makes about the consistency of the stats it's about to produce. Specifically, how those stats line up with runtime-internal stats. These checks are generally useful, but crashing just because some stats are wrong is a heavy price to pay. For a long time this wasn't a problem, but very recently it became a real problem. It turns out that there's real benign skew that can happen wherein sysmon (which doesn't synchronize with a STW) generates a trace event when tracing is enabled, and may mutate some stats while ReadMemStats is running its checks. Fix this by synchronizing with both sysmon and the tracer. This is a bit heavy-handed, but better that than false positives. Also, put the checks behind a debug mode. We want to reduce the risk of backporting this change, and again, it's not great to crash just because user-facing stats are off. Still, enable this debug mode during the runtime tests so we don't lose quite as much coverage from disabling these checks by default. Fixes #64401. Change-Id: I9adb3e5c7161d207648d07373a11da8a5f0fda9a Reviewed-on: https://go-review.googlesource.com/c/go/+/545277 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Michael Pratt <[email protected]> Reviewed-by: Felix Geisendörfer <[email protected]>
1 parent 5df4a63 commit b2efd1d

File tree

3 files changed

+71
-50
lines changed

3 files changed

+71
-50
lines changed

src/runtime/export_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,8 @@ func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int)
464464
startTheWorld(stw)
465465
}
466466

467+
var DoubleCheckReadMemStats = &doubleCheckReadMemStats
468+
467469
// ReadMemStatsSlow returns both the runtime-computed MemStats and
468470
// MemStats accumulated by scanning the heap.
469471
func ReadMemStatsSlow() (base, slow MemStats) {

src/runtime/gc_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,11 @@ func TestPageAccounting(t *testing.T) {
577577
}
578578
}
579579

580+
func init() {
581+
// Enable ReadMemStats' double-check mode.
582+
*runtime.DoubleCheckReadMemStats = true
583+
}
584+
580585
func TestReadMemStats(t *testing.T) {
581586
base, slow := runtime.ReadMemStatsSlow()
582587
if base != slow {

src/runtime/mstats.go

Lines changed: 64 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,11 @@ func ReadMemStats(m *MemStats) {
361361
startTheWorld(stw)
362362
}
363363

364+
// doubleCheckReadMemStats controls a double-check mode for ReadMemStats that
365+
// ensures consistency between the values that ReadMemStats is using and the
366+
// runtime-internal stats.
367+
var doubleCheckReadMemStats = false
368+
364369
// readmemstats_m populates stats for internal runtime values.
365370
//
366371
// The world must be stopped.
@@ -435,56 +440,65 @@ func readmemstats_m(stats *MemStats) {
435440

436441
heapGoal := gcController.heapGoal()
437442

438-
// The world is stopped, so the consistent stats (after aggregation)
439-
// should be identical to some combination of memstats. In particular:
440-
//
441-
// * memstats.heapInUse == inHeap
442-
// * memstats.heapReleased == released
443-
// * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits
444-
// * memstats.totalAlloc == totalAlloc
445-
// * memstats.totalFree == totalFree
446-
//
447-
// Check if that's actually true.
448-
//
449-
// TODO(mknyszek): Maybe don't throw here. It would be bad if a
450-
// bug in otherwise benign accounting caused the whole application
451-
// to crash.
452-
if gcController.heapInUse.load() != uint64(consStats.inHeap) {
453-
print("runtime: heapInUse=", gcController.heapInUse.load(), "\n")
454-
print("runtime: consistent value=", consStats.inHeap, "\n")
455-
throw("heapInUse and consistent stats are not equal")
456-
}
457-
if gcController.heapReleased.load() != uint64(consStats.released) {
458-
print("runtime: heapReleased=", gcController.heapReleased.load(), "\n")
459-
print("runtime: consistent value=", consStats.released, "\n")
460-
throw("heapReleased and consistent stats are not equal")
461-
}
462-
heapRetained := gcController.heapInUse.load() + gcController.heapFree.load()
463-
consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
464-
if heapRetained != consRetained {
465-
print("runtime: global value=", heapRetained, "\n")
466-
print("runtime: consistent value=", consRetained, "\n")
467-
throw("measures of the retained heap are not equal")
468-
}
469-
if gcController.totalAlloc.Load() != totalAlloc {
470-
print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n")
471-
print("runtime: consistent value=", totalAlloc, "\n")
472-
throw("totalAlloc and consistent stats are not equal")
473-
}
474-
if gcController.totalFree.Load() != totalFree {
475-
print("runtime: totalFree=", gcController.totalFree.Load(), "\n")
476-
print("runtime: consistent value=", totalFree, "\n")
477-
throw("totalFree and consistent stats are not equal")
478-
}
479-
// Also check that mappedReady lines up with totalMapped - released.
480-
// This isn't really the same type of "make sure consistent stats line up" situation,
481-
// but this is an opportune time to check.
482-
if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) {
483-
print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n")
484-
print("runtime: totalMapped=", totalMapped, "\n")
485-
print("runtime: released=", uint64(consStats.released), "\n")
486-
print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n")
487-
throw("mappedReady and other memstats are not equal")
443+
if doubleCheckReadMemStats {
444+
// Only check this if we're debugging. It would be bad to crash an application
445+
// just because the debugging stats are wrong. We mostly rely on tests to catch
446+
// these issues, and we enable the double check mode for tests.
447+
//
448+
// The world is stopped, so the consistent stats (after aggregation)
449+
// should be identical to some combination of memstats. In particular:
450+
//
451+
// * memstats.heapInUse == inHeap
452+
// * memstats.heapReleased == released
453+
// * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits
454+
// * memstats.totalAlloc == totalAlloc
455+
// * memstats.totalFree == totalFree
456+
//
457+
// Check if that's actually true.
458+
//
459+
// Prevent sysmon and the tracer from skewing the stats since they can
460+
// act without synchronizing with a STW. See #64401.
461+
lock(&sched.sysmonlock)
462+
lock(&trace.lock)
463+
if gcController.heapInUse.load() != uint64(consStats.inHeap) {
464+
print("runtime: heapInUse=", gcController.heapInUse.load(), "\n")
465+
print("runtime: consistent value=", consStats.inHeap, "\n")
466+
throw("heapInUse and consistent stats are not equal")
467+
}
468+
if gcController.heapReleased.load() != uint64(consStats.released) {
469+
print("runtime: heapReleased=", gcController.heapReleased.load(), "\n")
470+
print("runtime: consistent value=", consStats.released, "\n")
471+
throw("heapReleased and consistent stats are not equal")
472+
}
473+
heapRetained := gcController.heapInUse.load() + gcController.heapFree.load()
474+
consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
475+
if heapRetained != consRetained {
476+
print("runtime: global value=", heapRetained, "\n")
477+
print("runtime: consistent value=", consRetained, "\n")
478+
throw("measures of the retained heap are not equal")
479+
}
480+
if gcController.totalAlloc.Load() != totalAlloc {
481+
print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n")
482+
print("runtime: consistent value=", totalAlloc, "\n")
483+
throw("totalAlloc and consistent stats are not equal")
484+
}
485+
if gcController.totalFree.Load() != totalFree {
486+
print("runtime: totalFree=", gcController.totalFree.Load(), "\n")
487+
print("runtime: consistent value=", totalFree, "\n")
488+
throw("totalFree and consistent stats are not equal")
489+
}
490+
// Also check that mappedReady lines up with totalMapped - released.
491+
// This isn't really the same type of "make sure consistent stats line up" situation,
492+
// but this is an opportune time to check.
493+
if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) {
494+
print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n")
495+
print("runtime: totalMapped=", totalMapped, "\n")
496+
print("runtime: released=", uint64(consStats.released), "\n")
497+
print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n")
498+
throw("mappedReady and other memstats are not equal")
499+
}
500+
unlock(&trace.lock)
501+
unlock(&sched.sysmonlock)
488502
}
489503

490504
// We've calculated all the values we need. Now, populate stats.

0 commit comments

Comments
 (0)