@@ -424,34 +424,42 @@ func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
424
424
prios = append (prios , pri )
425
425
}
426
426
}
427
- sort .Slice (prios , func (i , j int ) bool {
428
- return prios [i ].score > prios [j ].score
429
- })
427
+ // We used to sort compaction priorities based on the score. But, we
428
+ // decided to compact based on the level, not the priority. So, upper
429
+ // levels (level 0, level 1, etc) always get compacted first, before the
430
+ // lower levels -- this allows us to avoid stalls.
430
431
return prios
431
432
}
432
433
433
- // compactBuildTables merge topTables and botTables to form a list of new tables.
434
+ // checkOverlap checks if the given tables overlap with any level from the given "lev" onwards.
435
+ func (s * levelsController ) checkOverlap (tables []* table.Table , lev int ) bool {
436
+ kr := getKeyRange (tables ... )
437
+ for i , lh := range s .levels {
438
+ if i < lev { // Skip upper levels.
439
+ continue
440
+ }
441
+ lh .RLock ()
442
+ left , right := lh .overlappingTables (levelHandlerRLocked {}, kr )
443
+ lh .RUnlock ()
444
+ if right - left > 0 {
445
+ return true
446
+ }
447
+ }
448
+ return false
449
+ }
450
+
451
+ // compactBuildTables merges topTables and botTables to form a list of new tables.
434
452
func (s * levelsController ) compactBuildTables (
435
453
lev int , cd compactDef ) ([]* table.Table , func () error , error ) {
436
454
topTables := cd .top
437
455
botTables := cd .bot
438
456
439
- var hasOverlap bool
440
- {
441
- kr := getKeyRange (cd .top ... )
442
- for i , lh := range s .levels {
443
- if i <= lev { // Skip upper levels.
444
- continue
445
- }
446
- lh .RLock ()
447
- left , right := lh .overlappingTables (levelHandlerRLocked {}, kr )
448
- lh .RUnlock ()
449
- if right - left > 0 {
450
- hasOverlap = true
451
- break
452
- }
453
- }
454
- }
457
+ // Check overlap of the top level with the levels which are not being
458
+ // compacted in this compaction. We don't need to check overlap of the bottom
459
+ // tables with other levels because if the top tables overlap with any of the lower
460
+ // levels, it implies bottom level also overlaps because top and bottom tables
461
+ // overlap with each other.
462
+ hasOverlap := s .checkOverlap (cd .top , cd .nextLevel .level + 1 )
455
463
456
464
// Try to collect stats so that we can inform value log about GC. That would help us find which
457
465
// value log file should be GCed.
@@ -561,10 +569,15 @@ func (s *levelsController) compactBuildTables(
561
569
// versions which are below the minReadTs, otherwise, we might end up discarding the
562
570
// only valid version for a running transaction.
563
571
numVersions ++
564
- lastValidVersion := vs .Meta & bitDiscardEarlierVersions > 0
565
- if isDeletedOrExpired (vs .Meta , vs .ExpiresAt ) ||
566
- numVersions > s .kv .opt .NumVersionsToKeep ||
567
- lastValidVersion {
572
+
573
+ // Keep the current version and discard all the next versions if
574
+ // - The `discardEarlierVersions` bit is set OR
575
+ // - We've already processed `NumVersionsToKeep` number of versions
576
+ // (including the current item being processed)
577
+ lastValidVersion := vs .Meta & bitDiscardEarlierVersions > 0 ||
578
+ numVersions == s .kv .opt .NumVersionsToKeep
579
+
580
+ if isDeletedOrExpired (vs .Meta , vs .ExpiresAt ) || lastValidVersion {
568
581
// If this version of the key is deleted or expired, skip all the rest of the
569
582
// versions. Ensure that we're only removing versions below readTs.
570
583
skipKey = y .SafeCopy (skipKey , it .Key ())
@@ -925,15 +938,13 @@ func (s *levelsController) addLevel0Table(t *table.Table) error {
925
938
s .cstatus .RUnlock ()
926
939
timeStart = time .Now ()
927
940
}
928
- // Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we
929
- // will very quickly fill up level 0 again and if the compaction strategy favors level 0,
930
- // then level 1 is going to super full.
941
+ // Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we
942
+ // will very quickly fill up level 0 again.
931
943
for i := 0 ; ; i ++ {
932
- // Passing 0 for delSize to compactable means we're treating incomplete compactions as
933
- // not having finished -- we wait for them to finish. Also, it's crucial this behavior
934
- // replicates pickCompactLevels' behavior in computing compactability in order to
935
- // guarantee progress.
936
- if ! s .isLevel0Compactable () && ! s .levels [1 ].isCompactable (0 ) {
944
+ // It's crucial that this behavior replicates pickCompactLevels' behavior in
945
+ // computing compactability in order to guarantee progress.
946
+ // Break the loop once L0 has enough space to accommodate new tables.
947
+ if ! s .isLevel0Compactable () {
937
948
break
938
949
}
939
950
time .Sleep (10 * time .Millisecond )
0 commit comments