From df10150d398a2343ea3f41beaa0a33a67a721943 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Fri, 10 Jan 2025 14:51:49 -0800 Subject: [PATCH 01/34] Purge expired postings cache items due inactivity (#6502) * Purge expired postings cache items due inactivity Signed-off-by: alanprot * Fix comments Signed-off-by: alanprot --------- Signed-off-by: alanprot Signed-off-by: Alex Le --- pkg/ingester/ingester.go | 20 ++++++++++++ pkg/ingester/ingester_test.go | 36 +++++++++++++++++++-- pkg/storage/tsdb/config.go | 6 ++++ pkg/storage/tsdb/expanded_postings_cache.go | 17 ++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index d46398d4b94..2502ef8c762 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -878,6 +878,14 @@ func (i *Ingester) starting(ctx context.Context) error { servs = append(servs, closeIdleService) } + if i.expandedPostingsCacheFactory != nil { + interval := i.cfg.BlocksStorageConfig.TSDB.ExpandedCachingExpireInterval + if interval == 0 { + interval = cortex_tsdb.ExpandedCachingExpireInterval + } + servs = append(servs, services.NewTimerService(interval, nil, i.expirePostingsCache, nil)) + } + var err error i.TSDBState.subservices, err = services.NewManager(servs...) if err == nil { @@ -2794,6 +2802,18 @@ func (i *Ingester) closeAndDeleteIdleUserTSDBs(ctx context.Context) error { return nil } +func (i *Ingester) expirePostingsCache(ctx context.Context) error { + for _, userID := range i.getTSDBUsers() { + if ctx.Err() != nil { + return nil + } + userDB := i.getTSDB(userID) + userDB.postingCache.PurgeExpiredItems() + } + + return nil +} + func (i *Ingester) closeAndDeleteUserTSDBIfIdle(userID string) tsdbCloseCheckResult { userDB := i.getTSDB(userID) if userDB == nil || userDB.shipper == nil { diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 15fec1d1fd5..1de49fb8811 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -5525,6 +5525,7 @@ func TestExpendedPostingsCacheIsolation(t *testing.T) { func TestExpendedPostingsCache(t *testing.T) { cfg := defaultIngesterTestConfig(t) + cfg.BlocksStorageConfig.TSDB.ExpandedCachingExpireInterval = time.Second cfg.BlocksStorageConfig.TSDB.BlockRanges = []time.Duration{2 * time.Hour} runQuery := func(t *testing.T, ctx context.Context, i *Ingester, matchers []*client.LabelMatcher) []client.TimeSeriesChunk { @@ -5540,9 +5541,10 @@ func TestExpendedPostingsCache(t *testing.T) { } tc := map[string]struct { - cacheConfig cortex_tsdb.TSDBPostingsCacheConfig - expectedBlockPostingCall int - expectedHeadPostingCall int + cacheConfig cortex_tsdb.TSDBPostingsCacheConfig + expectedBlockPostingCall int + expectedHeadPostingCall int + shouldExpireDueInactivity bool }{ "cacheDisabled": { expectedBlockPostingCall: 0, @@ -5594,6 +5596,23 @@ func TestExpendedPostingsCache(t *testing.T) { }, }, }, + "expire due inactivity": { + expectedBlockPostingCall: 1, + expectedHeadPostingCall: 1, + shouldExpireDueInactivity: true, + cacheConfig: cortex_tsdb.TSDBPostingsCacheConfig{ + Blocks: cortex_tsdb.PostingsCacheConfig{ + Ttl: time.Second, + MaxBytes: 1024 * 1024 * 1024, + Enabled: true, + }, + Head: cortex_tsdb.PostingsCacheConfig{ + Ttl: time.Second, + MaxBytes: 1024 * 1024 * 1024, + Enabled: true, + }, + }, + }, } for name, c := range tc { @@ -5790,6 +5809,17 @@ func TestExpendedPostingsCache(t *testing.T) { require.Len(t, runQuery(t, ctx, i, []*client.LabelMatcher{{Type: client.EQUAL, Name: "extra", Value: 
"1"}}), 1) // Return cached value from block and bypass head require.Equal(t, int64(0), postingsForMatchersCalls.Load()) + + if c.shouldExpireDueInactivity { + test.Poll(t, c.cacheConfig.Blocks.Ttl+c.cacheConfig.Head.Ttl+cfg.BlocksStorageConfig.TSDB.ExpandedCachingExpireInterval, 0, func() interface{} { + size := 0 + for _, userID := range i.getTSDBUsers() { + userDB := i.getTSDB(userID) + size += userDB.postingCache.Size() + } + return size + }) + } }) } } diff --git a/pkg/storage/tsdb/config.go b/pkg/storage/tsdb/config.go index 42d3494abee..afb51d1a416 100644 --- a/pkg/storage/tsdb/config.go +++ b/pkg/storage/tsdb/config.go @@ -30,6 +30,9 @@ const ( // How often are open TSDBs checked for being idle and closed. DefaultCloseIdleTSDBInterval = 5 * time.Minute + // How often expired items are cleaned from the PostingsCache + ExpandedCachingExpireInterval = 5 * time.Minute + // How often to check for tenant deletion mark. DeletionMarkCheckInterval = 1 * time.Hour @@ -156,6 +159,9 @@ type TSDBConfig struct { // How often to check for idle TSDBs for closing. DefaultCloseIdleTSDBInterval is not suitable for testing, so tests can override. CloseIdleTSDBInterval time.Duration `yaml:"-"` + // How often expired items are cleaned from the PostingsCache. ExpandedCachingExpireInterval is not suitable for testing, so tests can override. + ExpandedCachingExpireInterval time.Duration `yaml:"-"` + // Positive value enables experimental support for exemplars. 0 or less to disable. MaxExemplars int `yaml:"max_exemplars"` diff --git a/pkg/storage/tsdb/expanded_postings_cache.go b/pkg/storage/tsdb/expanded_postings_cache.go index 3ea8da709ec..a24087e824f 100644 --- a/pkg/storage/tsdb/expanded_postings_cache.go +++ b/pkg/storage/tsdb/expanded_postings_cache.go @@ -124,6 +124,8 @@ func (f *ExpandedPostingsCacheFactory) NewExpandedPostingsCache(userId string, m type ExpandedPostingsCache interface { PostingsForMatchers(ctx context.Context, blockID ulid.ULID, ix tsdb.IndexReader, ms ...*labels.Matcher) (index.Postings, error) ExpireSeries(metric labels.Labels) + PurgeExpiredItems() + Size() int } type blocksPostingsForMatchersCache struct { @@ -166,6 +168,15 @@ func (c *blocksPostingsForMatchersCache) ExpireSeries(metric labels.Labels) { c.seedByHash.incrementSeed(c.userId, metricName) } +func (c *blocksPostingsForMatchersCache) PurgeExpiredItems() { + c.headCache.expire() + c.blocksCache.expire() +} + +func (c *blocksPostingsForMatchersCache) Size() int { + return c.headCache.size() + c.blocksCache.size() +} + func (c *blocksPostingsForMatchersCache) PostingsForMatchers(ctx context.Context, blockID ulid.ULID, ix tsdb.IndexReader, ms ...*labels.Matcher) (index.Postings, error) { return c.fetchPostings(blockID, ix, ms...)(ctx) } @@ -365,6 +376,12 @@ func (c *fifoCache[V]) expire() { } } +func (c *fifoCache[V]) size() int { + c.cachedMtx.RLock() + defer c.cachedMtx.RUnlock() + return c.cached.Len() +} + func (c *fifoCache[V]) getPromiseForKey(k string, fetch func() (V, int64, error)) (*cacheEntryPromise[V], bool) { r := &cacheEntryPromise[V]{ done: make(chan struct{}), From f53d0bce382aa5808750eb58fe5b84eead6e258f Mon Sep 17 00:00:00 2001 From: Daniel Sabsay Date: Sat, 11 Jan 2025 16:33:33 -0800 Subject: [PATCH 02/34] Update thanos to 4ba0ba403896 (#6503) * Update thanos to 4ba0ba403896 Signed-off-by: Daniel Sabsay * run go mod vendor Signed-off-by: Daniel Sabsay --------- Signed-off-by: Daniel Sabsay Co-authored-by: Daniel Sabsay Signed-off-by: Alex Le --- go.mod | 2 +- go.sum | 4 ++-- 
.../thanos-io/thanos/pkg/cacheutil/async_op.go | 9 ++++++++- vendor/modules.txt | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 982db263fd7..85adabfc834 100644 --- a/go.mod +++ b/go.mod @@ -52,7 +52,7 @@ require ( github.com/stretchr/testify v1.10.0 github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3 - github.com/thanos-io/thanos v0.37.3-0.20250107220537-0d426361672a + github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 github.com/uber/jaeger-client-go v2.30.0+incompatible github.com/weaveworks/common v0.0.0-20230728070032-dd9e68f319d5 go.etcd.io/etcd/api/v3 v3.5.17 diff --git a/go.sum b/go.sum index d05a6090ba3..92dc0762eee 100644 --- a/go.sum +++ b/go.sum @@ -1657,8 +1657,8 @@ github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 h1:VjG0mwhN1Dkn github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw= github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3 h1:feQKBuPhRE/+xd4Ru6Jv48EzVatpXg2mnpl0x0f5OWY= github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3/go.mod h1:wx0JlRZtsB2S10JYUgeg5GqLfMxw31SzArP+28yyE00= -github.com/thanos-io/thanos v0.37.3-0.20250107220537-0d426361672a h1:VdOsK6zhseRVfpkOxCJ3b2MKhuP1sBjTnC7Bib7DLws= -github.com/thanos-io/thanos v0.37.3-0.20250107220537-0d426361672a/go.mod h1:VOu1neDpx4n/2OCQmfT/0RMU85UzhO35ce0S3Ew+NSk= +github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 h1:K5YqD5JzNPh7P/XGB2J19cxJlv61K9Mm2/UZ+iPVGMU= +github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896/go.mod h1:VOu1neDpx4n/2OCQmfT/0RMU85UzhO35ce0S3Ew+NSk= github.com/tjhop/slog-gokit v0.1.2 h1:pmQI4SvU9h4gA0vIQsdhJQSqQg4mOmsPykG2/PM3j1I= github.com/tjhop/slog-gokit v0.1.2/go.mod h1:8fhlcp8C8ELbg3GCyKv06tgt4B5sDq2P1r2DQAu1HuM= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= diff --git a/vendor/github.com/thanos-io/thanos/pkg/cacheutil/async_op.go b/vendor/github.com/thanos-io/thanos/pkg/cacheutil/async_op.go index f03f1d08271..fb468a5a78f 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/cacheutil/async_op.go +++ b/vendor/github.com/thanos-io/thanos/pkg/cacheutil/async_op.go @@ -54,7 +54,14 @@ func (p *AsyncOperationProcessor) asyncQueueProcessLoop() { case op := <-p.asyncQueue: op() case <-p.stop: - return + // Run all remaining operations before stopping + select { + case op := <-p.asyncQueue: + op() + continue + default: + return + } } } } diff --git a/vendor/modules.txt b/vendor/modules.txt index d6b508164c7..0ebdd814960 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -988,7 +988,7 @@ github.com/thanos-io/promql-engine/query github.com/thanos-io/promql-engine/ringbuffer github.com/thanos-io/promql-engine/storage github.com/thanos-io/promql-engine/storage/prometheus -# github.com/thanos-io/thanos v0.37.3-0.20250107220537-0d426361672a +# github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 ## explicit; go 1.23.0 github.com/thanos-io/thanos/pkg/api/query/querypb github.com/thanos-io/thanos/pkg/block From d6de77cea5f01331a465c8f3e7a5de088fc2da77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Jan 2025 02:37:38 -0800 Subject: 
[PATCH 03/34] Bump the actions-dependencies group across 1 directory with 2 updates (#6505) Bumps the actions-dependencies group with 2 updates in the / directory: [actions/upload-artifact](https://github.com/actions/upload-artifact) and [github/codeql-action](https://github.com/github/codeql-action). Updates `actions/upload-artifact` from 4.5.0 to 4.6.0 - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/6f51ac03b9356f520e9adb1b1b7802705f340c2b...65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) Updates `github/codeql-action` from 3.28.0 to 3.28.1 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/48ab28a6f5dbc2a99bf1e0131198dd8f1df78169...b6a472f63d85b9c78a3ac5e89422239fc15e9b3c) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions-dependencies - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch dependency-group: actions-dependencies ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Alex Le --- .github/workflows/build-image.yml | 2 +- .github/workflows/scorecards.yml | 4 ++-- .github/workflows/test-build-deploy.yml | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index aacf18810a7..6a8bd118c4f 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -34,7 +34,7 @@ jobs: run: make save-multiarch-build-image - name: Upload Docker Images Artifacts - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: build-image path: | diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 0a0df034d88..0b635368489 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -55,7 +55,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v3.pre.node20 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v3.pre.node20 with: name: SARIF file path: results.sarif @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: sarif_file: results.sarif diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 9e9c0499d63..9e6506a57a7 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -93,15 +93,15 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: languages: go - name: Autobuild - uses: github/codeql-action/autobuild@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 build: @@ -131,7 +131,7 @@ jobs: touch build-image/.uptodate make BUILD_IN_CONTAINER=false web-build - name: Upload Website Artifact - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: website public path: website/public/ @@ -143,7 +143,7 @@ jobs: - name: Create Docker Images Archive run: tar -cvf images.tar /tmp/images - name: Upload Docker Images Artifact - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: Docker Images path: ./images.tar From b95024fb9053f2608d780e6890f4dc4731dc6d2d Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Wed, 15 Jan 2025 01:21:13 +0900 Subject: [PATCH 04/34] calculate # of concurrency only once at the runner (#6506) Signed-off-by: SungJin1212 Signed-off-by: Alex Le --- pkg/util/concurrency/runner.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/util/concurrency/runner.go b/pkg/util/concurrency/runner.go index 9b30da0f0e7..8f6d180c779 100644 --- a/pkg/util/concurrency/runner.go +++ b/pkg/util/concurrency/runner.go @@ -29,7 +29,8 @@ func ForEachUser(ctx context.Context, userIDs []string, concurrency int, userFun errsMx := sync.Mutex{} wg := sync.WaitGroup{} - for ix := 0; ix < min(concurrency, len(userIDs)); ix++ { + routines := min(concurrency, len(userIDs)) + for ix := 0; ix < routines; ix++ { wg.Add(1) go func() { defer wg.Done() @@ -75,7 +76,8 @@ func ForEach(ctx context.Context, jobs []interface{}, concurrency int, jobFunc f // Start workers to process jobs. 
g, ctx := errgroup.WithContext(ctx) - for ix := 0; ix < min(concurrency, len(jobs)); ix++ { + routines := min(concurrency, len(jobs)) + for ix := 0; ix < routines; ix++ { g.Go(func() error { for job := range ch { if err := ctx.Err(); err != nil { From 0d17b32dbbf07c33522ac00a60b074143a6125d5 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Tue, 14 Jan 2025 11:41:25 -0800 Subject: [PATCH 05/34] Implement partition compaction planner (#6469) * Implement partition compaction grouper Signed-off-by: Alex Le * fix comment Signed-off-by: Alex Le * replace level 1 compaction limits with ingestion replication factor Signed-off-by: Alex Le * fix doc Signed-off-by: Alex Le * update compaction_visit_marker_timeout default value Signed-off-by: Alex Le * update default value for compactor_partition_index_size_limit_in_bytes Signed-off-by: Alex Le * refactor code Signed-off-by: Alex Le * address comments and refactor Signed-off-by: Alex Le * address comment Signed-off-by: Alex Le * address comment Signed-off-by: Alex Le * update config name Signed-off-by: Alex Le * Implement partition compaction planner Signed-off-by: Alex Le * fix after rebase Signed-off-by: Alex Le * addressed comments Signed-off-by: Alex Le * updated doc and refactored metric Signed-off-by: Alex Le * fix test Signed-off-by: Alex Le --------- Signed-off-by: Alex Le --- docs/blocks-storage/compactor.md | 6 + docs/configuration/config-file-reference.md | 6 + pkg/compactor/compactor.go | 11 +- pkg/compactor/compactor_metrics.go | 7 + pkg/compactor/compactor_metrics_test.go | 8 + pkg/compactor/partition_compaction_planner.go | 132 ++++++- .../partition_compaction_planner_test.go | 338 ++++++++++++++++++ 7 files changed, 497 insertions(+), 11 deletions(-) create mode 100644 pkg/compactor/partition_compaction_planner_test.go diff --git a/docs/blocks-storage/compactor.md b/docs/blocks-storage/compactor.md index f5539511ca3..1cfc53ec5c6 100644 --- a/docs/blocks-storage/compactor.md +++ b/docs/blocks-storage/compactor.md @@ -285,6 +285,12 @@ compactor: # CLI flag: -compactor.ring.wait-active-instance-timeout [wait_active_instance_timeout: | default = 10m] + # How long shuffle sharding planner would wait before running planning code. + # This delay would prevent double compaction when two compactors claimed same + # partition in grouper at same time. + # CLI flag: -compactor.sharding-planner-delay + [sharding_planner_delay: | default = 10s] + # The compaction strategy to use. Supported values are: default, partitioning. # CLI flag: -compactor.compaction-strategy [compaction_strategy: | default = "default"] diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 138705552e8..548c9c67980 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -2334,6 +2334,12 @@ sharding_ring: # CLI flag: -compactor.ring.wait-active-instance-timeout [wait_active_instance_timeout: | default = 10m] +# How long shuffle sharding planner would wait before running planning code. +# This delay would prevent double compaction when two compactors claimed same +# partition in grouper at same time. +# CLI flag: -compactor.sharding-planner-delay +[sharding_planner_delay: | default = 10s] + # The compaction strategy to use. Supported values are: default, partitioning. 
# CLI flag: -compactor.compaction-strategy [compaction_strategy: | default = "default"] diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index c50a98ca484..a1a9a8f8a2b 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -153,7 +153,7 @@ var ( plannerFactory := func(ctx context.Context, bkt objstore.InstrumentedBucket, logger log.Logger, cfg Config, noCompactionMarkFilter *compact.GatherNoCompactionMarkFilter, ringLifecycle *ring.Lifecycler, userID string, blockVisitMarkerReadFailed prometheus.Counter, blockVisitMarkerWriteFailed prometheus.Counter, compactorMetrics *compactorMetrics) compact.Planner { if cfg.CompactionStrategy == util.CompactionStrategyPartitioning { - return NewPartitionCompactionPlanner(ctx, bkt, logger) + return NewPartitionCompactionPlanner(ctx, bkt, logger, cfg.BlockRanges.ToMilliseconds(), noCompactionMarkFilter.NoCompactMarkedBlocks, ringLifecycle.ID, userID, cfg.ShardingPlannerDelay, cfg.CompactionVisitMarkerTimeout, cfg.CompactionVisitMarkerFileUpdateInterval, compactorMetrics) } else { return NewShuffleShardingPlanner(ctx, bkt, logger, cfg.BlockRanges.ToMilliseconds(), noCompactionMarkFilter.NoCompactMarkedBlocks, ringLifecycle.ID, cfg.CompactionVisitMarkerTimeout, cfg.CompactionVisitMarkerFileUpdateInterval, blockVisitMarkerReadFailed, blockVisitMarkerWriteFailed) } @@ -234,9 +234,10 @@ type Config struct { DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` // Compactors sharding. - ShardingEnabled bool `yaml:"sharding_enabled"` - ShardingStrategy string `yaml:"sharding_strategy"` - ShardingRing RingConfig `yaml:"sharding_ring"` + ShardingEnabled bool `yaml:"sharding_enabled"` + ShardingStrategy string `yaml:"sharding_strategy"` + ShardingRing RingConfig `yaml:"sharding_ring"` + ShardingPlannerDelay time.Duration `yaml:"sharding_planner_delay"` // Compaction strategy. CompactionStrategy string `yaml:"compaction_strategy"` @@ -304,6 +305,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.AcceptMalformedIndex, "compactor.accept-malformed-index", false, "When enabled, index verification will ignore out of order label names.") f.BoolVar(&cfg.CachingBucketEnabled, "compactor.caching-bucket-enabled", false, "When enabled, caching bucket will be used for compactor, except cleaner service, which serves as the source of truth for block status") + + f.DurationVar(&cfg.ShardingPlannerDelay, "compactor.sharding-planner-delay", 10*time.Second, "How long shuffle sharding planner would wait before running planning code. 
This delay would prevent double compaction when two compactors claimed same partition in grouper at same time.") } func (cfg *Config) Validate(limits validation.Limits) error { diff --git a/pkg/compactor/compactor_metrics.go b/pkg/compactor/compactor_metrics.go index e14fb9a0dc9..23e7bca6c02 100644 --- a/pkg/compactor/compactor_metrics.go +++ b/pkg/compactor/compactor_metrics.go @@ -39,6 +39,7 @@ type compactorMetrics struct { remainingPlannedCompactions *prometheus.GaugeVec compactionErrorsCount *prometheus.CounterVec partitionCount *prometheus.GaugeVec + compactionsNotPlanned *prometheus.CounterVec } const ( @@ -174,6 +175,10 @@ func newCompactorMetricsWithLabels(reg prometheus.Registerer, commonLabels []str Name: "cortex_compactor_group_partition_count", Help: "Number of partitions for each compaction group.", }, compactionLabels) + m.compactionsNotPlanned = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_compactor_group_compactions_not_planned_total", + Help: "Total number of group compaction not planned due to error.", + }, compactionLabels) return &m } @@ -225,6 +230,7 @@ func (m *compactorMetrics) initMetricWithCompactionLabelValues(labelValue ...str m.compactionFailures.WithLabelValues(labelValue...) m.verticalCompactions.WithLabelValues(labelValue...) m.partitionCount.WithLabelValues(labelValue...) + m.compactionsNotPlanned.WithLabelValues(labelValue...) } func (m *compactorMetrics) deleteMetricsForDeletedTenant(userID string) { @@ -236,4 +242,5 @@ func (m *compactorMetrics) deleteMetricsForDeletedTenant(userID string) { m.compactionFailures.DeleteLabelValues(userID) m.verticalCompactions.DeleteLabelValues(userID) m.partitionCount.DeleteLabelValues(userID) + m.compactionsNotPlanned.DeleteLabelValues(userID) } diff --git a/pkg/compactor/compactor_metrics_test.go b/pkg/compactor/compactor_metrics_test.go index f2a13276cd6..947fd7f396a 100644 --- a/pkg/compactor/compactor_metrics_test.go +++ b/pkg/compactor/compactor_metrics_test.go @@ -135,6 +135,11 @@ func TestSyncerMetrics(t *testing.T) { cortex_compactor_group_partition_count{user="aaa"} 511060 cortex_compactor_group_partition_count{user="bbb"} 522170 cortex_compactor_group_partition_count{user="ccc"} 533280 + # HELP cortex_compactor_group_compactions_not_planned_total Total number of group compaction not planned due to error. 
+ # TYPE cortex_compactor_group_compactions_not_planned_total counter + cortex_compactor_group_compactions_not_planned_total{user="aaa"} 544390 + cortex_compactor_group_compactions_not_planned_total{user="bbb"} 555500 + cortex_compactor_group_compactions_not_planned_total{user="ccc"} 566610 `)) require.NoError(t, err) @@ -191,4 +196,7 @@ func generateTestData(cm *compactorMetrics, base float64) { cm.partitionCount.WithLabelValues("aaa").Add(46 * base) cm.partitionCount.WithLabelValues("bbb").Add(47 * base) cm.partitionCount.WithLabelValues("ccc").Add(48 * base) + cm.compactionsNotPlanned.WithLabelValues("aaa").Add(49 * base) + cm.compactionsNotPlanned.WithLabelValues("bbb").Add(50 * base) + cm.compactionsNotPlanned.WithLabelValues("ccc").Add(51 * base) } diff --git a/pkg/compactor/partition_compaction_planner.go b/pkg/compactor/partition_compaction_planner.go index 963771aa6d7..436bba426bf 100644 --- a/pkg/compactor/partition_compaction_planner.go +++ b/pkg/compactor/partition_compaction_planner.go @@ -2,30 +2,148 @@ package compactor import ( "context" + "fmt" + "time" "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/oklog/ulid" + "github.com/pkg/errors" "github.com/thanos-io/objstore" "github.com/thanos-io/thanos/pkg/block/metadata" + + "github.com/cortexproject/cortex/pkg/storage/tsdb" +) + +var ( + plannerCompletedPartitionError = errors.New("got completed partition") + plannerVisitedPartitionError = errors.New("got partition visited by other compactor") ) type PartitionCompactionPlanner struct { - ctx context.Context - bkt objstore.InstrumentedBucket - logger log.Logger + ctx context.Context + bkt objstore.InstrumentedBucket + logger log.Logger + ranges []int64 + noCompBlocksFunc func() map[ulid.ULID]*metadata.NoCompactMark + ringLifecyclerID string + userID string + plannerDelay time.Duration + partitionVisitMarkerTimeout time.Duration + partitionVisitMarkerFileUpdateInterval time.Duration + compactorMetrics *compactorMetrics } func NewPartitionCompactionPlanner( ctx context.Context, bkt objstore.InstrumentedBucket, logger log.Logger, + ranges []int64, + noCompBlocksFunc func() map[ulid.ULID]*metadata.NoCompactMark, + ringLifecyclerID string, + userID string, + plannerDelay time.Duration, + partitionVisitMarkerTimeout time.Duration, + partitionVisitMarkerFileUpdateInterval time.Duration, + compactorMetrics *compactorMetrics, ) *PartitionCompactionPlanner { return &PartitionCompactionPlanner{ - ctx: ctx, - bkt: bkt, - logger: logger, + ctx: ctx, + bkt: bkt, + logger: logger, + ranges: ranges, + noCompBlocksFunc: noCompBlocksFunc, + ringLifecyclerID: ringLifecyclerID, + userID: userID, + plannerDelay: plannerDelay, + partitionVisitMarkerTimeout: partitionVisitMarkerTimeout, + partitionVisitMarkerFileUpdateInterval: partitionVisitMarkerFileUpdateInterval, + compactorMetrics: compactorMetrics, } } func (p *PartitionCompactionPlanner) Plan(ctx context.Context, metasByMinTime []*metadata.Meta, errChan chan error, extensions any) ([]*metadata.Meta, error) { - panic("PartitionCompactionPlanner not implemented") + cortexMetaExtensions, err := tsdb.ConvertToCortexMetaExtensions(extensions) + if err != nil { + return nil, err + } + if cortexMetaExtensions == nil { + return nil, fmt.Errorf("cortexMetaExtensions cannot be nil") + } + return p.PlanWithPartition(ctx, metasByMinTime, cortexMetaExtensions, errChan) +} + +func (p *PartitionCompactionPlanner) PlanWithPartition(_ context.Context, metasByMinTime []*metadata.Meta, 
cortexMetaExtensions *tsdb.CortexMetaExtensions, errChan chan error) ([]*metadata.Meta, error) { + partitionInfo := cortexMetaExtensions.PartitionInfo + if partitionInfo == nil { + return nil, fmt.Errorf("partitionInfo cannot be nil") + } + partitionID := partitionInfo.PartitionID + partitionedGroupID := partitionInfo.PartitionedGroupID + + // This delay would prevent double compaction when two compactors + // claimed same partition in grouper at same time. + time.Sleep(p.plannerDelay) + + visitMarker := newPartitionVisitMarker(p.ringLifecyclerID, partitionedGroupID, partitionID) + visitMarkerManager := NewVisitMarkerManager(p.bkt, p.logger, p.ringLifecyclerID, visitMarker) + existingPartitionVisitMarker := &partitionVisitMarker{} + err := visitMarkerManager.ReadVisitMarker(p.ctx, existingPartitionVisitMarker) + visitMarkerExists := true + if err != nil { + if errors.Is(err, errorVisitMarkerNotFound) { + visitMarkerExists = false + } else { + p.compactorMetrics.compactionsNotPlanned.WithLabelValues(p.userID, cortexMetaExtensions.TimeRangeStr()).Inc() + return nil, fmt.Errorf("unable to get visit marker file for partition with partition ID %d, partitioned group ID %d: %s", partitionID, partitionedGroupID, err.Error()) + } + } + if visitMarkerExists { + if existingPartitionVisitMarker.GetStatus() == Completed { + level.Warn(p.logger).Log("msg", "partition is in completed status", "partitioned_group_id", partitionedGroupID, "partition_id", partitionID, "compactor_id", p.ringLifecyclerID, existingPartitionVisitMarker.String()) + return nil, plannerCompletedPartitionError + } + if !existingPartitionVisitMarker.IsPendingByCompactor(p.partitionVisitMarkerTimeout, partitionID, p.ringLifecyclerID) { + level.Warn(p.logger).Log("msg", "partition is not visited by current compactor", "partitioned_group_id", partitionedGroupID, "partition_id", partitionID, "compactor_id", p.ringLifecyclerID, existingPartitionVisitMarker.String()) + return nil, plannerVisitedPartitionError + } + } + + // Ensure all blocks fits within the largest range. This is a double check + // to ensure there's no bug in the previous blocks grouping, given this Plan() + // is just a pass-through. 
+ // Modified from https://github.com/cortexproject/cortex/pull/2616/files#diff-e3051fc530c48bb276ba958dd8fadc684e546bd7964e6bc75cef9a86ef8df344R28-R63 + largestRange := p.ranges[len(p.ranges)-1] + rangeStart := getRangeStart(metasByMinTime[0], largestRange) + rangeEnd := rangeStart + largestRange + noCompactMarked := p.noCompBlocksFunc() + resultMetas := make([]*metadata.Meta, 0, len(metasByMinTime)) + + for _, b := range metasByMinTime { + if b.ULID == DUMMY_BLOCK_ID { + continue + } + blockID := b.ULID.String() + if _, excluded := noCompactMarked[b.ULID]; excluded { + continue + } + + if b.MinTime < rangeStart || b.MaxTime > rangeEnd { + p.compactorMetrics.compactionsNotPlanned.WithLabelValues(p.userID, cortexMetaExtensions.TimeRangeStr()).Inc() + level.Warn(p.logger).Log("msg", "block is outside the largest expected range", "partitioned_group_id", partitionedGroupID, "partition_id", partitionID, "block_id", blockID, "block_min_time", b.MinTime, "block_max_time", b.MaxTime, "range_start", rangeStart, "range_end", rangeEnd) + return nil, fmt.Errorf("block %s with time range %d:%d is outside the largest expected range %d:%d", blockID, b.MinTime, b.MaxTime, rangeStart, rangeEnd) + } + + resultMetas = append(resultMetas, b) + } + + if len(resultMetas) < 1 { + p.compactorMetrics.compactionsNotPlanned.WithLabelValues(p.userID, cortexMetaExtensions.TimeRangeStr()).Inc() + level.Warn(p.logger).Log("msg", "result meta size is empty", "partitioned_group_id", partitionedGroupID, "partition_id", partitionID, "group_size", len(metasByMinTime)) + return nil, nil + } + + go visitMarkerManager.HeartBeat(p.ctx, errChan, p.partitionVisitMarkerFileUpdateInterval, false) + + return resultMetas, nil } diff --git a/pkg/compactor/partition_compaction_planner_test.go b/pkg/compactor/partition_compaction_planner_test.go new file mode 100644 index 00000000000..67d5ba60f54 --- /dev/null +++ b/pkg/compactor/partition_compaction_planner_test.go @@ -0,0 +1,338 @@ +package compactor + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "github.com/thanos-io/objstore" + "github.com/thanos-io/thanos/pkg/block/metadata" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + cortextsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" + "github.com/cortexproject/cortex/pkg/util/concurrency" +) + +func TestPartitionCompactionPlanner_Plan(t *testing.T) { + type VisitedPartition struct { + isExpired bool + compactorID string + } + + currentCompactor := "test-compactor" + otherCompactor := "other-compactor" + + block1ulid := ulid.MustNew(1, nil) + block2ulid := ulid.MustNew(2, nil) + block3ulid := ulid.MustNew(3, nil) + + tests := map[string]struct { + ranges []int64 + noCompactBlocks map[ulid.ULID]*metadata.NoCompactMark + blocks []*metadata.Meta + expected []*metadata.Meta + expectedErr error + visitedPartition VisitedPartition + }{ + "test basic plan": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 
* time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expected: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + }, + "test blocks outside largest range smaller min time after": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 2 * time.Hour.Milliseconds(), + MaxTime: 4 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 0 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expectedErr: fmt.Errorf("block %s with time range %d:%d is outside the largest expected range %d:%d", block2ulid.String(), 0*time.Hour.Milliseconds(), 2*time.Hour.Milliseconds(), 2*time.Hour.Milliseconds(), 4*time.Hour.Milliseconds()), + }, + "test blocks outside largest range 1": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 0 * time.Hour.Milliseconds(), + MaxTime: 4 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 0 * time.Hour.Milliseconds(), + MaxTime: 4 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expectedErr: fmt.Errorf("block %s with time range %d:%d is outside the largest expected range %d:%d", block1ulid.String(), 0*time.Hour.Milliseconds(), 4*time.Hour.Milliseconds(), 0*time.Hour.Milliseconds(), 2*time.Hour.Milliseconds()), + }, + "test blocks outside largest range 2": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 0 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 0 * time.Hour.Milliseconds(), + MaxTime: 4 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expectedErr: fmt.Errorf("block %s with time range %d:%d is outside the largest expected range %d:%d", block2ulid.String(), 0*time.Hour.Milliseconds(), 4*time.Hour.Milliseconds(), 0*time.Hour.Milliseconds(), 2*time.Hour.Milliseconds()), + }, + "test should skip blocks marked for no compact": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + noCompactBlocks: map[ulid.ULID]*metadata.NoCompactMark{block1ulid: {}}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block3ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expected: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ 
+ ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block3ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + }, + "test should not compact if there is no compactable block": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + noCompactBlocks: map[ulid.ULID]*metadata.NoCompactMark{block1ulid: {}}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: currentCompactor, + }, + expected: []*metadata.Meta{}, + }, + "test should not compact if visit marker file is not expired and visited by other compactor": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: false, + compactorID: otherCompactor, + }, + expectedErr: plannerVisitedPartitionError, + }, + "test should not compact if visit marker file is expired": { + ranges: []int64{2 * time.Hour.Milliseconds()}, + blocks: []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: block1ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: block2ulid, + MinTime: 1 * time.Hour.Milliseconds(), + MaxTime: 2 * time.Hour.Milliseconds(), + }, + }, + }, + visitedPartition: VisitedPartition{ + isExpired: true, + compactorID: currentCompactor, + }, + expectedErr: plannerVisitedPartitionError, + }, + } + + visitMarkerTimeout := 5 * time.Minute + partitionedGroupID := uint32(1) + partitionID := 0 + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + bkt := &bucket.ClientMock{} + visitMarkerFile := GetPartitionVisitMarkerFilePath(partitionedGroupID, partitionID) + expireTime := time.Now() + if testData.visitedPartition.isExpired { + expireTime = expireTime.Add(-1 * visitMarkerTimeout) + } + visitMarker := partitionVisitMarker{ + CompactorID: testData.visitedPartition.compactorID, + PartitionedGroupID: partitionedGroupID, + PartitionID: partitionID, + VisitTime: expireTime.Unix(), + Status: Pending, + Version: PartitionVisitMarkerVersion1, + } + visitMarkerFileContent, _ := json.Marshal(visitMarker) + bkt.MockGet(visitMarkerFile, string(visitMarkerFileContent), nil) + bkt.MockUpload(mock.Anything, nil) + bkt.MockGet(mock.Anything, "", nil) + + registerer := prometheus.NewPedanticRegistry() + + metrics := newCompactorMetrics(registerer) + + logs := &concurrency.SyncBuffer{} + logger := log.NewLogfmtLogger(logs) + p := NewPartitionCompactionPlanner( + context.Background(), + objstore.WithNoopInstr(bkt), + logger, + testData.ranges, + func() map[ulid.ULID]*metadata.NoCompactMark { + return testData.noCompactBlocks + }, + currentCompactor, + "test-user", + 10*time.Millisecond, + visitMarkerTimeout, + time.Minute, + metrics, + ) + actual, err := p.Plan(context.Background(), testData.blocks, nil, &cortextsdb.CortexMetaExtensions{ + PartitionInfo: &cortextsdb.PartitionInfo{ + PartitionCount: 1, + PartitionID: partitionID, + PartitionedGroupID: 
partitionedGroupID, + }, + }) + + if testData.expectedErr != nil { + assert.Equal(t, err, testData.expectedErr) + } else { + require.NoError(t, err) + } + + require.Len(t, actual, len(testData.expected)) + + for idx, expectedMeta := range testData.expected { + assert.Equal(t, expectedMeta.ULID, actual[idx].ULID) + } + }) + } +} From 5ebdb83eabb4be28a388c5098f6639af00b64dc7 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Thu, 16 Jan 2025 06:26:17 +0900 Subject: [PATCH 06/34] Add max tenant config to tenant federation (#6493) Signed-off-by: SungJin1212 Signed-off-by: Alex Le --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 4 + pkg/cortex/modules.go | 2 +- pkg/frontend/frontend_test.go | 3 +- pkg/frontend/transport/handler.go | 28 +++-- pkg/frontend/transport/handler_test.go | 106 +++++++++++++++++- pkg/frontend/v1/frontend_test.go | 5 +- .../tenantfederation/tenant_federation.go | 3 + 8 files changed, 140 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3c309dae9a..e4820d90b11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * [FEATURE] Query Frontend: Support an exemplar federated query when `-tenant-federation.enabled=true`. #6455 * [FEATURE] Ingester/StoreGateway: Add support for cache regex query matchers via `-ingester.matchers-cache-max-items` and `-blocks-storage.bucket-store.matchers-cache-max-items`. #6477 #6491 * [ENHANCEMENT] Query Frontend: Add a `source` label to query stat metrics. #6470 +* [ENHANCEMENT] Query Frontend: Add a flag `-tenant-federation.max-tenant` to limit the number of tenants for federated query. #6493 * [ENHANCEMENT] Querier: Add a `-tenant-federation.max-concurrent` flags to configure the number of worker processing federated query and add a `cortex_querier_federated_tenants_per_query` histogram to track the number of tenants per query. #6449 * [ENHANCEMENT] Query Frontend: Add a number of series in the query response to the query stat log. #6423 * [ENHANCEMENT] Store Gateway: Add a hedged request to reduce the tail latency. #6388 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 548c9c67980..28154b7b180 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -161,6 +161,10 @@ tenant_federation: # CLI flag: -tenant-federation.max-concurrent [max_concurrent: | default = 16] + # A maximum number of tenants to query at once. 0 means no limit. + # CLI flag: -tenant-federation.max-tenant + [max_tenant: | default = 0] + # The ruler_config configures the Cortex ruler. [ruler: ] diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 795c588af51..390212a313d 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -536,7 +536,7 @@ func (t *Cortex) initQueryFrontend() (serv services.Service, err error) { // Wrap roundtripper into Tripperware. 
roundTripper = t.QueryFrontendTripperware(roundTripper) - handler := transport.NewHandler(t.Cfg.Frontend.Handler, roundTripper, util_log.Logger, prometheus.DefaultRegisterer) + handler := transport.NewHandler(t.Cfg.Frontend.Handler, t.Cfg.TenantFederation, roundTripper, util_log.Logger, prometheus.DefaultRegisterer) t.API.RegisterQueryFrontendHandler(handler) if frontendV1 != nil { diff --git a/pkg/frontend/frontend_test.go b/pkg/frontend/frontend_test.go index 77694689bdb..08251a2f2ed 100644 --- a/pkg/frontend/frontend_test.go +++ b/pkg/frontend/frontend_test.go @@ -27,6 +27,7 @@ import ( "github.com/cortexproject/cortex/pkg/frontend/transport" frontendv1 "github.com/cortexproject/cortex/pkg/frontend/v1" "github.com/cortexproject/cortex/pkg/frontend/v1/frontendv1pb" + "github.com/cortexproject/cortex/pkg/querier/tenantfederation" querier_worker "github.com/cortexproject/cortex/pkg/querier/worker" "github.com/cortexproject/cortex/pkg/util/concurrency" "github.com/cortexproject/cortex/pkg/util/flagext" @@ -279,7 +280,7 @@ func testFrontend(t *testing.T, config CombinedFrontendConfig, handler http.Hand r.PathPrefix("/").Handler(middleware.Merge( middleware.AuthenticateUser, middleware.Tracer{}, - ).Wrap(transport.NewHandler(config.Handler, rt, logger, nil))) + ).Wrap(transport.NewHandler(config.Handler, tenantfederation.Config{}, rt, logger, nil))) httpServer := http.Server{ Handler: r, diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go index 68a0ae61445..b703778ca0a 100644 --- a/pkg/frontend/transport/handler.go +++ b/pkg/frontend/transport/handler.go @@ -22,6 +22,7 @@ import ( "google.golang.org/grpc/status" querier_stats "github.com/cortexproject/cortex/pkg/querier/stats" + "github.com/cortexproject/cortex/pkg/querier/tenantfederation" "github.com/cortexproject/cortex/pkg/querier/tripperware" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" @@ -33,6 +34,8 @@ const ( // StatusClientClosedRequest is the status code for when a client request cancellation of a http request StatusClientClosedRequest = 499 ServiceTimingHeaderName = "Server-Timing" + + errTooManyTenants = "too many tenants, max: %d, actual: %d" ) var ( @@ -84,9 +87,10 @@ func (cfg *HandlerConfig) RegisterFlags(f *flag.FlagSet) { // Handler accepts queries and forwards them to RoundTripper. It can log slow queries, // but all other logic is inside the RoundTripper. type Handler struct { - cfg HandlerConfig - log log.Logger - roundTripper http.RoundTripper + cfg HandlerConfig + tenantFederationCfg tenantfederation.Config + log log.Logger + roundTripper http.RoundTripper // Metrics. querySeconds *prometheus.CounterVec @@ -101,11 +105,12 @@ type Handler struct { } // NewHandler creates a new frontend handler. 
-func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logger, reg prometheus.Registerer) *Handler { +func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config, roundTripper http.RoundTripper, log log.Logger, reg prometheus.Registerer) *Handler { h := &Handler{ - cfg: cfg, - log: log, - roundTripper: roundTripper, + cfg: cfg, + tenantFederationCfg: tenantFederationCfg, + log: log, + roundTripper: roundTripper, } if cfg.QueryStatsEnabled { @@ -185,6 +190,15 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { if err != nil { return } + + if f.tenantFederationCfg.Enabled { + maxTenant := f.tenantFederationCfg.MaxTenant + if maxTenant > 0 && len(tenantIDs) > maxTenant { + http.Error(w, fmt.Errorf(errTooManyTenants, maxTenant, len(tenantIDs)).Error(), http.StatusBadRequest) + return + } + } + userID := tenant.JoinTenantIDs(tenantIDs) // Initialise the stats in the context and make sure it's propagated diff --git a/pkg/frontend/transport/handler_test.go b/pkg/frontend/transport/handler_test.go index c21e1bf18ea..92e0b59fd48 100644 --- a/pkg/frontend/transport/handler_test.go +++ b/pkg/frontend/transport/handler_test.go @@ -20,11 +20,14 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/weaveworks/common/httpgrpc" + "github.com/weaveworks/common/middleware" "github.com/weaveworks/common/user" "google.golang.org/grpc/codes" querier_stats "github.com/cortexproject/cortex/pkg/querier/stats" + "github.com/cortexproject/cortex/pkg/querier/tenantfederation" "github.com/cortexproject/cortex/pkg/querier/tripperware" + "github.com/cortexproject/cortex/pkg/tenant" util_api "github.com/cortexproject/cortex/pkg/util/api" util_log "github.com/cortexproject/cortex/pkg/util/log" ) @@ -178,6 +181,7 @@ func TestHandler_ServeHTTP(t *testing.T) { }, nil }) userID := "12345" + tenantFederationCfg := tenantfederation.Config{} for _, tt := range []struct { name string cfg HandlerConfig @@ -379,7 +383,7 @@ func TestHandler_ServeHTTP(t *testing.T) { } { t.Run(tt.name, func(t *testing.T) { reg := prometheus.NewPedanticRegistry() - handler := NewHandler(tt.cfg, tt.roundTripperFunc, log.NewNopLogger(), reg) + handler := NewHandler(tt.cfg, tenantFederationCfg, tt.roundTripperFunc, log.NewNopLogger(), reg) ctx := user.InjectOrgID(context.Background(), userID) req := httptest.NewRequest("GET", "/", nil) @@ -413,7 +417,7 @@ func TestHandler_ServeHTTP(t *testing.T) { func TestReportQueryStatsFormat(t *testing.T) { outputBuf := bytes.NewBuffer(nil) logger := log.NewSyncLogger(log.NewLogfmtLogger(outputBuf)) - handler := NewHandler(HandlerConfig{QueryStatsEnabled: true}, http.DefaultTransport, logger, nil) + handler := NewHandler(HandlerConfig{QueryStatsEnabled: true}, tenantfederation.Config{}, http.DefaultTransport, logger, nil) userID := "fake" req, _ := http.NewRequest(http.MethodGet, "http://localhost:8080/prometheus/api/v1/query", nil) resp := &http.Response{ContentLength: 1000} @@ -506,3 +510,101 @@ func TestReportQueryStatsFormat(t *testing.T) { }) } } + +func Test_TenantFederation_MaxTenant(t *testing.T) { + // set a multi tenant resolver + tenant.WithDefaultResolver(tenant.NewMultiResolver()) + + roundTripper := roundTripperFunc(func(req *http.Request) (*http.Response, error) { + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader("{}")), + }, nil + }) + + tests := []struct { + name string + cfg tenantfederation.Config 
+ orgId string + expectedStatusCode int + expectedErrMsg string + }{ + { + name: "one tenant", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 0, + }, + orgId: "org1", + expectedStatusCode: http.StatusOK, + }, + { + name: "less than max tenant", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 3, + }, + orgId: "org1|org2", + expectedStatusCode: http.StatusOK, + }, + { + name: "equal to max tenant", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 2, + }, + orgId: "org1|org2", + expectedStatusCode: http.StatusOK, + }, + { + name: "exceeds max tenant", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 2, + }, + orgId: "org1|org2|org3", + expectedStatusCode: http.StatusBadRequest, + expectedErrMsg: "too many tenants, max: 2, actual: 3", + }, + { + name: "no org Id", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 0, + }, + orgId: "", + expectedStatusCode: http.StatusUnauthorized, + expectedErrMsg: "no org id", + }, + { + name: "no limit", + cfg: tenantfederation.Config{ + Enabled: true, + MaxTenant: 0, + }, + orgId: "org1|org2|org3", + expectedStatusCode: http.StatusOK, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + handler := NewHandler(HandlerConfig{}, test.cfg, roundTripper, log.NewNopLogger(), nil) + handlerWithAuth := middleware.Merge(middleware.AuthenticateUser).Wrap(handler) + + req := httptest.NewRequest("GET", "http://fake", nil) + req.Header.Set("X-Scope-OrgId", test.orgId) + resp := httptest.NewRecorder() + + handlerWithAuth.ServeHTTP(resp, req) + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Equal(t, test.expectedStatusCode, resp.Code) + + if test.expectedErrMsg != "" { + require.Contains(t, string(body), test.expectedErrMsg) + } + }) + } +} diff --git a/pkg/frontend/v1/frontend_test.go b/pkg/frontend/v1/frontend_test.go index ef7cd705f2a..7ae0c97c299 100644 --- a/pkg/frontend/v1/frontend_test.go +++ b/pkg/frontend/v1/frontend_test.go @@ -29,6 +29,7 @@ import ( "github.com/cortexproject/cortex/pkg/frontend/transport" "github.com/cortexproject/cortex/pkg/frontend/v1/frontendv1pb" + "github.com/cortexproject/cortex/pkg/querier/tenantfederation" querier_worker "github.com/cortexproject/cortex/pkg/querier/worker" "github.com/cortexproject/cortex/pkg/scheduler/queue" "github.com/cortexproject/cortex/pkg/util/flagext" @@ -264,6 +265,8 @@ func testFrontend(t *testing.T, config Config, handler http.Handler, test func(a // Default HTTP handler config. handlerCfg := transport.HandlerConfig{} + tenantFederationCfg := tenantfederation.Config{} + flagext.DefaultValues(&handlerCfg) rt := transport.AdaptGrpcRoundTripperToHTTPRoundTripper(v1) @@ -271,7 +274,7 @@ func testFrontend(t *testing.T, config Config, handler http.Handler, test func(a r.PathPrefix("/").Handler(middleware.Merge( middleware.AuthenticateUser, middleware.Tracer{}, - ).Wrap(transport.NewHandler(handlerCfg, rt, logger, nil))) + ).Wrap(transport.NewHandler(handlerCfg, tenantFederationCfg, rt, logger, nil))) httpServer := http.Server{ Handler: r, diff --git a/pkg/querier/tenantfederation/tenant_federation.go b/pkg/querier/tenantfederation/tenant_federation.go index 56e5fb59db9..4b161ab7328 100644 --- a/pkg/querier/tenantfederation/tenant_federation.go +++ b/pkg/querier/tenantfederation/tenant_federation.go @@ -9,9 +9,12 @@ type Config struct { Enabled bool `yaml:"enabled"` // MaxConcurrent The number of workers used for processing federated query. 
MaxConcurrent int `yaml:"max_concurrent"` + // MaxTenant A maximum number of tenants to query at once. + MaxTenant int `yaml:"max_tenant"` } func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.Enabled, "tenant-federation.enabled", false, "If enabled on all Cortex services, queries can be federated across multiple tenants. The tenant IDs involved need to be specified separated by a `|` character in the `X-Scope-OrgID` header (experimental).") f.IntVar(&cfg.MaxConcurrent, "tenant-federation.max-concurrent", defaultMaxConcurrency, "The number of workers used to process each federated query.") + f.IntVar(&cfg.MaxTenant, "tenant-federation.max-tenant", 0, "A maximum number of tenants to query at once. 0 means no limit.") } From 963d7bde71085af5d7466152cd53ae660df9d858 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Wed, 15 Jan 2025 13:43:00 -0800 Subject: [PATCH 07/34] Add cleaner logic to clean partition compaction blocks and related files (#6507) * Add cleaner logic to clean partition compaction blocks and related files Signed-off-by: Alex Le * refactored metrics Signed-off-by: Alex Le * refactor Signed-off-by: Alex Le * update logs Signed-off-by: Alex Le --------- Signed-off-by: Alex Le --- pkg/compactor/blocks_cleaner.go | 133 ++++++++++++++++++++++++ pkg/compactor/blocks_cleaner_test.go | 96 +++++++++++++++-- pkg/compactor/compactor.go | 6 +- pkg/compactor/partitioned_group_info.go | 21 ++++ 4 files changed, 248 insertions(+), 8 deletions(-) diff --git a/pkg/compactor/blocks_cleaner.go b/pkg/compactor/blocks_cleaner.go index 3ea46a5f38a..71dfc775dfd 100644 --- a/pkg/compactor/blocks_cleaner.go +++ b/pkg/compactor/blocks_cleaner.go @@ -3,6 +3,7 @@ package compactor import ( "context" "fmt" + "strings" "sync" "time" @@ -39,6 +40,8 @@ type BlocksCleanerConfig struct { CleanupConcurrency int BlockDeletionMarksMigrationEnabled bool // TODO Discuss whether we should remove it in Cortex 1.8.0 and document that upgrading to 1.7.0 before 1.8.0 is required. TenantCleanupDelay time.Duration // Delay before removing tenant deletion mark and "debug". + ShardingStrategy string + CompactionStrategy string } type BlocksCleaner struct { @@ -57,6 +60,7 @@ type BlocksCleaner struct { cleanerVisitMarkerTimeout time.Duration cleanerVisitMarkerFileUpdateInterval time.Duration + compactionVisitMarkerTimeout time.Duration // Metrics. 
runsStarted *prometheus.CounterVec @@ -73,12 +77,16 @@ type BlocksCleaner struct { tenantBucketIndexLastUpdate *prometheus.GaugeVec tenantBlocksCleanedTotal *prometheus.CounterVec tenantCleanDuration *prometheus.GaugeVec + remainingPlannedCompactions *prometheus.GaugeVec + inProgressCompactions *prometheus.GaugeVec + oldestPartitionGroupOffset *prometheus.GaugeVec } func NewBlocksCleaner( cfg BlocksCleanerConfig, bucketClient objstore.InstrumentedBucket, usersScanner *cortex_tsdb.UsersScanner, + compactionVisitMarkerTimeout time.Duration, cfgProvider ConfigProvider, logger log.Logger, ringLifecyclerID string, @@ -86,11 +94,27 @@ func NewBlocksCleaner( cleanerVisitMarkerTimeout time.Duration, cleanerVisitMarkerFileUpdateInterval time.Duration, blocksMarkedForDeletion *prometheus.CounterVec, + remainingPlannedCompactions *prometheus.GaugeVec, ) *BlocksCleaner { + + var inProgressCompactions *prometheus.GaugeVec + var oldestPartitionGroupOffset *prometheus.GaugeVec + if cfg.ShardingStrategy == util.ShardingStrategyShuffle && cfg.CompactionStrategy == util.CompactionStrategyPartitioning { + inProgressCompactions = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_compactor_in_progress_compactions", + Help: "Total number of in progress compactions. Only available with shuffle-sharding strategy and partitioning compaction strategy", + }, commonLabels) + oldestPartitionGroupOffset = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_compactor_oldest_partition_offset", + Help: "Time in seconds between now and the oldest created partition group not completed. Only available with shuffle-sharding strategy and partitioning compaction strategy", + }, commonLabels) + } + c := &BlocksCleaner{ cfg: cfg, bucketClient: bucketClient, usersScanner: usersScanner, + compactionVisitMarkerTimeout: compactionVisitMarkerTimeout, cfgProvider: cfgProvider, logger: log.With(logger, "component", "cleaner"), ringLifecyclerID: ringLifecyclerID, @@ -153,6 +177,9 @@ func NewBlocksCleaner( Name: "cortex_bucket_clean_duration_seconds", Help: "Duration of cleaner runtime for a tenant in seconds", }, commonLabels), + remainingPlannedCompactions: remainingPlannedCompactions, + inProgressCompactions: inProgressCompactions, + oldestPartitionGroupOffset: oldestPartitionGroupOffset, } c.Service = services.NewBasicService(c.starting, c.loop, nil) @@ -327,6 +354,13 @@ func (c *BlocksCleaner) scanUsers(ctx context.Context) ([]string, []string, erro c.tenantBlocksMarkedForNoCompaction.DeleteLabelValues(userID) c.tenantPartialBlocks.DeleteLabelValues(userID) c.tenantBucketIndexLastUpdate.DeleteLabelValues(userID) + if c.cfg.ShardingStrategy == util.ShardingStrategyShuffle { + c.remainingPlannedCompactions.DeleteLabelValues(userID) + if c.cfg.CompactionStrategy == util.CompactionStrategyPartitioning { + c.inProgressCompactions.DeleteLabelValues(userID) + c.oldestPartitionGroupOffset.DeleteLabelValues(userID) + } + } } } c.lastOwnedUsers = allUsers @@ -447,6 +481,15 @@ func (c *BlocksCleaner) deleteUserMarkedForDeletion(ctx context.Context, userLog level.Info(userLogger).Log("msg", "deleted files under "+block.DebugMetas+" for tenant marked for deletion", "count", deleted) } + if c.cfg.CompactionStrategy == util.CompactionStrategyPartitioning { + // Clean up partitioned group info files + if deleted, err := bucket.DeletePrefix(ctx, userBucket, PartitionedGroupDirectory, userLogger); err != nil { + return errors.Wrap(err, "failed to delete "+PartitionedGroupDirectory) + } else if deleted > 0 { + 
level.Info(userLogger).Log("msg", "deleted files under "+PartitionedGroupDirectory+" for tenant marked for deletion", "count", deleted) + } + } + if deleted, err := bucket.DeletePrefix(ctx, userBucket, bucketindex.MarkersPathname, userLogger); err != nil { return errors.Wrap(err, "failed to delete marker files") } else if deleted > 0 { @@ -592,6 +635,12 @@ func (c *BlocksCleaner) cleanUser(ctx context.Context, userLogger log.Logger, us } level.Info(userLogger).Log("msg", "finish writing new index", "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds()) + if c.cfg.ShardingStrategy == util.ShardingStrategyShuffle && c.cfg.CompactionStrategy == util.CompactionStrategyPartitioning { + begin = time.Now() + c.cleanPartitionedGroupInfo(ctx, userBucket, userLogger, userID) + level.Info(userLogger).Log("msg", "finish cleaning partitioned group info files", "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds()) + } + c.tenantBlocks.WithLabelValues(userID).Set(float64(len(idx.Blocks))) c.tenantBlocksMarkedForDelete.WithLabelValues(userID).Set(float64(len(idx.BlockDeletionMarks))) c.tenantBlocksMarkedForNoCompaction.WithLabelValues(userID).Set(float64(totalBlocksBlocksMarkedForNoCompaction)) @@ -600,6 +649,90 @@ func (c *BlocksCleaner) cleanUser(ctx context.Context, userLogger log.Logger, us return nil } +func (c *BlocksCleaner) cleanPartitionedGroupInfo(ctx context.Context, userBucket objstore.InstrumentedBucket, userLogger log.Logger, userID string) { + existentPartitionedGroupInfo := make(map[*PartitionedGroupInfo]struct { + path string + status PartitionedGroupStatus + }) + err := userBucket.Iter(ctx, PartitionedGroupDirectory, func(file string) error { + if strings.Contains(file, PartitionVisitMarkerDirectory) { + return nil + } + partitionedGroupInfo, err := ReadPartitionedGroupInfoFile(ctx, userBucket, userLogger, file) + if err != nil { + level.Warn(userLogger).Log("msg", "failed to read partitioned group info", "partitioned_group_info", file) + return nil + } + + status := partitionedGroupInfo.getPartitionedGroupStatus(ctx, userBucket, c.compactionVisitMarkerTimeout, userLogger) + level.Debug(userLogger).Log("msg", "got partitioned group status", "partitioned_group_status", status.String()) + existentPartitionedGroupInfo[partitionedGroupInfo] = struct { + path string + status PartitionedGroupStatus + }{ + path: file, + status: status, + } + return nil + }) + + if err != nil { + level.Warn(userLogger).Log("msg", "error return when going through partitioned group directory", "err", err) + } + + remainingCompactions := 0 + inProgressCompactions := 0 + var oldestPartitionGroup *PartitionedGroupInfo + defer func() { + c.remainingPlannedCompactions.WithLabelValues(userID).Set(float64(remainingCompactions)) + c.inProgressCompactions.WithLabelValues(userID).Set(float64(inProgressCompactions)) + if c.oldestPartitionGroupOffset != nil { + if oldestPartitionGroup != nil { + c.oldestPartitionGroupOffset.WithLabelValues(userID).Set(float64(time.Now().Unix() - oldestPartitionGroup.CreationTime)) + level.Debug(userLogger).Log("msg", "partition group info with oldest creation time", "partitioned_group_id", oldestPartitionGroup.PartitionedGroupID, "creation_time", oldestPartitionGroup.CreationTime) + } else { + c.oldestPartitionGroupOffset.WithLabelValues(userID).Set(0) + } + } + }() + for partitionedGroupInfo, extraInfo := range existentPartitionedGroupInfo { + partitionedGroupInfoFile := extraInfo.path + + remainingCompactions += 
extraInfo.status.PendingPartitions + inProgressCompactions += extraInfo.status.InProgressPartitions + if oldestPartitionGroup == nil || partitionedGroupInfo.CreationTime < oldestPartitionGroup.CreationTime { + oldestPartitionGroup = partitionedGroupInfo + } + if extraInfo.status.CanDelete { + if extraInfo.status.IsCompleted { + // Try to remove all blocks included in partitioned group info + if err := partitionedGroupInfo.markAllBlocksForDeletion(ctx, userBucket, userLogger, c.blocksMarkedForDeletion, userID); err != nil { + level.Warn(userLogger).Log("msg", "unable to mark all blocks in partitioned group info for deletion", "partitioned_group_id", partitionedGroupInfo.PartitionedGroupID) + // if one block can not be marked for deletion, we should + // skip delete this partitioned group. next iteration + // would try it again. + continue + } + } + + if err := userBucket.Delete(ctx, partitionedGroupInfoFile); err != nil { + level.Warn(userLogger).Log("msg", "failed to delete partitioned group info", "partitioned_group_info", partitionedGroupInfoFile, "err", err) + } else { + level.Info(userLogger).Log("msg", "deleted partitioned group info", "partitioned_group_info", partitionedGroupInfoFile) + } + } + + if extraInfo.status.CanDelete || extraInfo.status.DeleteVisitMarker { + // Remove partition visit markers + if _, err := bucket.DeletePrefix(ctx, userBucket, GetPartitionVisitMarkerDirectoryPath(partitionedGroupInfo.PartitionedGroupID), userLogger); err != nil { + level.Warn(userLogger).Log("msg", "failed to delete partition visit markers for partitioned group", "partitioned_group_info", partitionedGroupInfoFile, "err", err) + } else { + level.Info(userLogger).Log("msg", "deleted partition visit markers for partitioned group", "partitioned_group_info", partitionedGroupInfoFile) + } + } + } +} + // cleanUserPartialBlocks delete partial blocks which are safe to be deleted. The provided partials map // and index are updated accordingly. 
func (c *BlocksCleaner) cleanUserPartialBlocks(ctx context.Context, userID string, partials map[ulid.ULID]error, idx *bucketindex.Index, userBucket objstore.InstrumentedBucket, userLogger log.Logger) { diff --git a/pkg/compactor/blocks_cleaner_test.go b/pkg/compactor/blocks_cleaner_test.go index d3c7aa6da9e..f7ac1a998b0 100644 --- a/pkg/compactor/blocks_cleaner_test.go +++ b/pkg/compactor/blocks_cleaner_test.go @@ -24,6 +24,7 @@ import ( "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" cortex_testutil "github.com/cortexproject/cortex/pkg/storage/tsdb/testutil" + "github.com/cortexproject/cortex/pkg/util" util_log "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/services" ) @@ -86,8 +87,9 @@ func TestBlockCleaner_KeyPermissionDenied(t *testing.T) { Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, mbucket, scanner, cfgProvider, logger, "test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, mbucket, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) // Clean User with no error cleaner.bucketClient = bkt @@ -193,8 +195,9 @@ func testBlocksCleanerWithOptions(t *testing.T, options testBlocksCleanerOptions Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner)) defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck @@ -354,8 +357,9 @@ func TestBlocksCleaner_ShouldContinueOnBlockDeletionFailure(t *testing.T) { Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, "test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner)) defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck @@ -418,8 +422,9 @@ func TestBlocksCleaner_ShouldRebuildBucketIndexOnCorruptedOne(t *testing.T) { Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, "test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, 
"test-cleaner", nil, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) require.NoError(t, services.StartAndAwaitRunning(ctx, cleaner)) defer services.StopAndAwaitTerminated(ctx, cleaner) //nolint:errcheck @@ -476,8 +481,9 @@ func TestBlocksCleaner_ShouldRemoveMetricsForTenantsNotBelongingAnymoreToTheShar Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) activeUsers, deleteUsers, err := cleaner.scanUsers(ctx) require.NoError(t, err) require.NoError(t, cleaner.cleanUpActiveUsers(ctx, activeUsers, true)) @@ -617,8 +623,9 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) { Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) - cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion) + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) assertBlockExists := func(user string, block ulid.ULID, expectExists bool) { exists, err := bucketClient.Exists(ctx, path.Join(user, block.String(), metadata.MetaFilename)) @@ -811,6 +818,83 @@ func TestBlocksCleaner_ShouldRemoveBlocksOutsideRetentionPeriod(t *testing.T) { } } +func TestBlocksCleaner_CleanPartitionedGroupInfo(t *testing.T) { + bucketClient, _ := cortex_testutil.PrepareFilesystemBucket(t) + bucketClient = bucketindex.BucketWithGlobalMarkers(bucketClient) + + ts := func(hours int) int64 { + return time.Now().Add(time.Duration(hours)*time.Hour).Unix() * 1000 + } + + userID := "user-1" + partitionedGroupID := uint32(123) + partitionCount := 1 + startTime := ts(-10) + endTime := ts(-8) + block1 := createTSDBBlock(t, bucketClient, userID, startTime, endTime, nil) + + cfg := BlocksCleanerConfig{ + DeletionDelay: time.Hour, + CleanupInterval: time.Minute, + CleanupConcurrency: 1, + ShardingStrategy: util.ShardingStrategyShuffle, + CompactionStrategy: util.CompactionStrategyPartitioning, + } + + ctx := context.Background() + logger := log.NewNopLogger() + reg := prometheus.NewPedanticRegistry() + scanner := tsdb.NewUsersScanner(bucketClient, tsdb.AllUsers, logger) + cfgProvider := newMockConfigProvider() + blocksMarkedForDeletion := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: blocksMarkedForDeletionName, + Help: blocksMarkedForDeletionHelp, + }, append(commonLabels, reasonLabelName)) + dummyGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"test"}) + + cleaner := NewBlocksCleaner(cfg, bucketClient, scanner, 60*time.Second, cfgProvider, logger, "test-cleaner", reg, time.Minute, 30*time.Second, blocksMarkedForDeletion, dummyGaugeVec) + + userBucket := bucket.NewUserBucketClient(userID, bucketClient, cfgProvider) + + partitionedGroupInfo := PartitionedGroupInfo{ + PartitionedGroupID: partitionedGroupID, + PartitionCount: 
partitionCount, + Partitions: []Partition{ + { + PartitionID: 0, + Blocks: []ulid.ULID{block1}, + }, + }, + RangeStart: startTime, + RangeEnd: endTime, + CreationTime: time.Now().Add(-5 * time.Minute).Unix(), + Version: PartitionedGroupInfoVersion1, + } + _, err := UpdatePartitionedGroupInfo(ctx, userBucket, logger, partitionedGroupInfo) + require.NoError(t, err) + + visitMarker := &partitionVisitMarker{ + PartitionedGroupID: partitionedGroupID, + PartitionID: 0, + Status: Completed, + VisitTime: time.Now().Add(-2 * time.Minute).Unix(), + } + visitMarkerManager := NewVisitMarkerManager(userBucket, logger, "dummy-cleaner", visitMarker) + err = visitMarkerManager.updateVisitMarker(ctx) + require.NoError(t, err) + + cleaner.cleanPartitionedGroupInfo(ctx, userBucket, logger, userID) + + partitionedGroupFileExists, err := userBucket.Exists(ctx, GetPartitionedGroupFile(partitionedGroupID)) + require.NoError(t, err) + require.False(t, partitionedGroupFileExists) + + block1DeletionMarkerExists, err := userBucket.Exists(ctx, path.Join(block1.String(), metadata.DeletionMarkFilename)) + require.NoError(t, err) + require.True(t, block1DeletionMarkerExists) + +} + type mockConfigProvider struct { userRetentionPeriods map[string]time.Duration } diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index a1a9a8f8a2b..01f534f6296 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -657,8 +657,10 @@ func (c *Compactor) starting(ctx context.Context) error { CleanupConcurrency: c.compactorCfg.CleanupConcurrency, BlockDeletionMarksMigrationEnabled: c.compactorCfg.BlockDeletionMarksMigrationEnabled, TenantCleanupDelay: c.compactorCfg.TenantCleanupDelay, - }, c.bucketClient, c.usersScanner, c.limits, c.parentLogger, cleanerRingLifecyclerID, c.registerer, c.compactorCfg.CleanerVisitMarkerTimeout, c.compactorCfg.CleanerVisitMarkerFileUpdateInterval, - c.compactorMetrics.syncerBlocksMarkedForDeletion) + ShardingStrategy: c.compactorCfg.ShardingStrategy, + CompactionStrategy: c.compactorCfg.CompactionStrategy, + }, c.bucketClient, c.usersScanner, c.compactorCfg.CompactionVisitMarkerTimeout, c.limits, c.parentLogger, cleanerRingLifecyclerID, c.registerer, c.compactorCfg.CleanerVisitMarkerTimeout, c.compactorCfg.CleanerVisitMarkerFileUpdateInterval, + c.compactorMetrics.syncerBlocksMarkedForDeletion, c.compactorMetrics.remainingPlannedCompactions) // Ensure an initial cleanup occurred before starting the compactor. 
if err := services.StartAndAwaitRunning(ctx, c.blocksCleaner); err != nil { diff --git a/pkg/compactor/partitioned_group_info.go b/pkg/compactor/partitioned_group_info.go index 71d4c61639c..f1c429a07a6 100644 --- a/pkg/compactor/partitioned_group_info.go +++ b/pkg/compactor/partitioned_group_info.go @@ -14,7 +14,9 @@ import ( "github.com/go-kit/log/level" "github.com/oklog/ulid" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/objstore" + "github.com/thanos-io/thanos/pkg/block" "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/cortexproject/cortex/pkg/util/runutil" @@ -232,6 +234,25 @@ func (p *PartitionedGroupInfo) isBlockNoCompact(ctx context.Context, userBucket return noCompactMarkerExists } +func (p *PartitionedGroupInfo) markAllBlocksForDeletion(ctx context.Context, userBucket objstore.InstrumentedBucket, userLogger log.Logger, blocksMarkedForDeletion *prometheus.CounterVec, userID string) error { + blocks := p.getAllBlocks() + deleteBlocksCount := 0 + defer func() { + level.Info(userLogger).Log("msg", "total number of blocks marked for deletion during partitioned group info clean up", "count", deleteBlocksCount) + }() + for _, blockID := range blocks { + if p.doesBlockExist(ctx, userBucket, userLogger, blockID) && !p.isBlockDeleted(ctx, userBucket, userLogger, blockID) { + if err := block.MarkForDeletion(ctx, userLogger, userBucket, blockID, "delete block during partitioned group completion check", blocksMarkedForDeletion.WithLabelValues(userID, reasonValueRetention)); err != nil { + level.Warn(userLogger).Log("msg", "unable to mark block for deletion", "partitioned_group_id", p.PartitionedGroupID, "block", blockID.String()) + return err + } + deleteBlocksCount++ + level.Debug(userLogger).Log("msg", "marked block for deletion during partitioned group info clean up", "partitioned_group_id", p.PartitionedGroupID, "block", blockID.String()) + } + } + return nil +} + func (p *PartitionedGroupInfo) String() string { var partitions []string for _, partition := range p.Partitions { From 0aa90488ccdb4487224ded3bd9731b406a4bd19b Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Thu, 16 Jan 2025 08:31:27 -0800 Subject: [PATCH 08/34] Update RELEASE.md (#6511) Maintainers would like an additional week to get the partition compactor changes in before the first release candidate for 1.19. Signed-off-by: Charlie Le Signed-off-by: Alex Le --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 1f4041e7ad5..7a25bec8a9d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -34,7 +34,7 @@ Our goal is to provide a new minor release every 6 weeks. 
This is a new process | v1.16.0 | 2023-11-05 | Ben Ye (@yeya24) | | v1.17.0 | 2024-04-25 | Ben Ye (@yeya24) | | v1.18.0 | 2024-08-16 | Daniel Blando (@danielblando) | -| v1.19.0 | 2025-01-15 | Charlie Le (@charlietle) | +| v1.19.0 | 2025-01-22 | Charlie Le (@charlietle) | ## Release shepherd responsibilities From 51772d4302c18f4c2e4a2100733e64177b7a0707 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Thu, 16 Jan 2025 10:27:30 -0800 Subject: [PATCH 09/34] update thanos version to 236777732278c64ca01c1c09d726f0f712c87164 (#6514) Signed-off-by: yeya24 Signed-off-by: Alex Le --- go.mod | 2 +- go.sum | 4 +-- pkg/storegateway/bucket_stores.go | 3 +- .../thanos/pkg/discovery/dns/grpc.go | 2 +- .../thanos/pkg/errutil/multierror.go | 9 ++++++ .../thanos-io/thanos/pkg/extkingpin/flags.go | 10 +------ .../pkg/extkingpin/path_content_reloader.go | 19 +++++++++++++ .../thanos-io/thanos/pkg/query/endpointset.go | 28 +++++++------------ .../thanos-io/thanos/pkg/store/bucket.go | 21 ++++++++++++-- .../thanos/pkg/store/lazy_postings.go | 3 +- vendor/modules.txt | 2 +- 11 files changed, 67 insertions(+), 36 deletions(-) diff --git a/go.mod b/go.mod index 85adabfc834..be6de1356eb 100644 --- a/go.mod +++ b/go.mod @@ -52,7 +52,7 @@ require ( github.com/stretchr/testify v1.10.0 github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3 - github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 + github.com/thanos-io/thanos v0.37.3-0.20250115144759-236777732278 github.com/uber/jaeger-client-go v2.30.0+incompatible github.com/weaveworks/common v0.0.0-20230728070032-dd9e68f319d5 go.etcd.io/etcd/api/v3 v3.5.17 diff --git a/go.sum b/go.sum index 92dc0762eee..64dfdce3ba3 100644 --- a/go.sum +++ b/go.sum @@ -1657,8 +1657,8 @@ github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 h1:VjG0mwhN1Dkn github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw= github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3 h1:feQKBuPhRE/+xd4Ru6Jv48EzVatpXg2mnpl0x0f5OWY= github.com/thanos-io/promql-engine v0.0.0-20250110162513-14f995518af3/go.mod h1:wx0JlRZtsB2S10JYUgeg5GqLfMxw31SzArP+28yyE00= -github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 h1:K5YqD5JzNPh7P/XGB2J19cxJlv61K9Mm2/UZ+iPVGMU= -github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896/go.mod h1:VOu1neDpx4n/2OCQmfT/0RMU85UzhO35ce0S3Ew+NSk= +github.com/thanos-io/thanos v0.37.3-0.20250115144759-236777732278 h1:HkZohVruRD0ENAXZIl2qDcpblbMok++jb3zHvjUeQfg= +github.com/thanos-io/thanos v0.37.3-0.20250115144759-236777732278/go.mod h1:DvlfyJhdYeufGbw3z6VQuDpGh2Q46/XvalnmEtQOf/0= github.com/tjhop/slog-gokit v0.1.2 h1:pmQI4SvU9h4gA0vIQsdhJQSqQg4mOmsPykG2/PM3j1I= github.com/tjhop/slog-gokit v0.1.2/go.mod h1:8fhlcp8C8ELbg3GCyKv06tgt4B5sDq2P1r2DQAu1HuM= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= diff --git a/pkg/storegateway/bucket_stores.go b/pkg/storegateway/bucket_stores.go index 778136b3c30..fe69645c57c 100644 --- a/pkg/storegateway/bucket_stores.go +++ b/pkg/storegateway/bucket_stores.go @@ -642,7 +642,8 @@ func (u *BucketStores) getOrCreateStore(userID string) (*store.BucketStore, erro }), store.WithLazyExpandedPostings(u.cfg.BucketStore.LazyExpandedPostingsEnabled), 
store.WithPostingGroupMaxKeySeriesRatio(u.cfg.BucketStore.LazyExpandedPostingGroupMaxKeySeriesRatio), - store.WithDontResort(true), // Cortex doesn't need to resort series in store gateway. + store.WithSeriesMatchRatio(0.5), // TODO: expose this as a config. + store.WithDontResort(true), // Cortex doesn't need to resort series in store gateway. store.WithBlockLifecycleCallback(&shardingBlockLifecycleCallbackAdapter{ userID: userID, strategy: u.shardingStrategy, diff --git a/vendor/github.com/thanos-io/thanos/pkg/discovery/dns/grpc.go b/vendor/github.com/thanos-io/thanos/pkg/discovery/dns/grpc.go index 79e832b6529..7971e7991cb 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/discovery/dns/grpc.go +++ b/vendor/github.com/thanos-io/thanos/pkg/discovery/dns/grpc.go @@ -23,7 +23,7 @@ type builder struct { logger log.Logger } -func RegisterGRPCResolver(provider *Provider, interval time.Duration, logger log.Logger) { +func RegisterGRPCResolver(logger log.Logger, provider *Provider, interval time.Duration) { grpcresolver.Register(&builder{ resolveInterval: interval, provider: provider, diff --git a/vendor/github.com/thanos-io/thanos/pkg/errutil/multierror.go b/vendor/github.com/thanos-io/thanos/pkg/errutil/multierror.go index a99b714e275..600a5573248 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/errutil/multierror.go +++ b/vendor/github.com/thanos-io/thanos/pkg/errutil/multierror.go @@ -71,6 +71,15 @@ func (es NonNilMultiError) Cause() error { return es.getCause() } +func (es NonNilMultiError) Is(target error) bool { + for _, err := range es { + if errors.Is(err, target) { + return true + } + } + return false +} + func (es NonNilMultiError) getCause() NonNilMultiRootError { var causes []error for _, err := range es { diff --git a/vendor/github.com/thanos-io/thanos/pkg/extkingpin/flags.go b/vendor/github.com/thanos-io/thanos/pkg/extkingpin/flags.go index 62b9142bebe..033769c56ed 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/extkingpin/flags.go +++ b/vendor/github.com/thanos-io/thanos/pkg/extkingpin/flags.go @@ -47,10 +47,8 @@ func Addrs(flags *kingpin.FlagClause) (target *addressSlice) { return } -// validateAddrs checks an address slice for duplicates and empty or invalid elements. +// validateAddrs checks an address slice for empty or invalid elements. func validateAddrs(addrs addressSlice) error { - set := map[string]struct{}{} - for _, addr := range addrs { if addr == "" { return errors.New("Address is empty.") @@ -61,12 +59,6 @@ func validateAddrs(addrs addressSlice) error { if len(qtypeAndName) != 2 && len(hostAndPort) != 2 { return errors.Errorf("Address %s is not of : format or a valid DNS query.", addr) } - - if _, ok := set[addr]; ok { - return errors.Errorf("Address %s is duplicated.", addr) - } - - set[addr] = struct{}{} } return nil diff --git a/vendor/github.com/thanos-io/thanos/pkg/extkingpin/path_content_reloader.go b/vendor/github.com/thanos-io/thanos/pkg/extkingpin/path_content_reloader.go index e96b0ddb34a..b2b84db6c5f 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/extkingpin/path_content_reloader.go +++ b/vendor/github.com/thanos-io/thanos/pkg/extkingpin/path_content_reloader.go @@ -21,6 +21,25 @@ type fileContent interface { Path() string } +type NopConfigContent struct{} + +var _ fileContent = (*NopConfigContent)(nil) + +// Content returns no content and no error. +func (n NopConfigContent) Content() ([]byte, error) { + return nil, nil +} + +// Path returns an empty path. 
+func (n NopConfigContent) Path() string { + return "" +} + +// NewNopConfig creates a no-op config content (no configuration). +func NewNopConfig() NopConfigContent { + return NopConfigContent{} +} + // PathContentReloader runs the reloadFunc when it detects that the contents of fileContent have changed. func PathContentReloader(ctx context.Context, fileContent fileContent, logger log.Logger, reloadFunc func(), debounceTime time.Duration) error { filePath, err := filepath.Abs(fileContent.Path()) diff --git a/vendor/github.com/thanos-io/thanos/pkg/query/endpointset.go b/vendor/github.com/thanos-io/thanos/pkg/query/endpointset.go index 4c519bf925f..071e04a8465 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/query/endpointset.go +++ b/vendor/github.com/thanos-io/thanos/pkg/query/endpointset.go @@ -211,8 +211,7 @@ type EndpointSet struct { // Endpoint specifications can change dynamically. If some component is missing from the list, we assume it is no longer // accessible and we close gRPC client for it, unless it is strict. - endpointSpec func() map[string]*GRPCEndpointSpec - dialOpts []grpc.DialOption + endpointSpecs func() map[string]*GRPCEndpointSpec endpointInfoTimeout time.Duration unhealthyEndpointTimeout time.Duration @@ -235,7 +234,6 @@ func NewEndpointSet( logger log.Logger, reg prometheus.Registerer, endpointSpecs func() []*GRPCEndpointSpec, - dialOpts []grpc.DialOption, unhealthyEndpointTimeout time.Duration, endpointInfoTimeout time.Duration, endpointMetricLabels ...string, @@ -254,19 +252,17 @@ func NewEndpointSet( } return &EndpointSet{ - now: now, - logger: log.With(logger, "component", "endpointset"), - endpointsMetric: endpointsMetric, - - dialOpts: dialOpts, + now: now, + logger: log.With(logger, "component", "endpointset"), + endpointsMetric: endpointsMetric, endpointInfoTimeout: endpointInfoTimeout, unhealthyEndpointTimeout: unhealthyEndpointTimeout, - endpointSpec: func() map[string]*GRPCEndpointSpec { - specs := make(map[string]*GRPCEndpointSpec) + endpointSpecs: func() map[string]*GRPCEndpointSpec { + res := make(map[string]*GRPCEndpointSpec) for _, s := range endpointSpecs() { - specs[s.addr] = s + res[s.addr] = s } - return specs + return res }, endpoints: make(map[string]*endpointRef), } @@ -288,7 +284,7 @@ func (e *EndpointSet) Update(ctx context.Context) { mu sync.Mutex ) - for _, spec := range e.endpointSpec() { + for _, spec := range e.endpointSpecs() { spec := spec if er, existingRef := e.endpoints[spec.Addr()]; existingRef { @@ -571,11 +567,7 @@ type endpointRef struct { // newEndpointRef creates a new endpointRef with a gRPC channel to the given the IP address. // The call to newEndpointRef will return an error if establishing the channel fails. func (e *EndpointSet) newEndpointRef(spec *GRPCEndpointSpec) (*endpointRef, error) { - var dialOpts []grpc.DialOption - - dialOpts = append(dialOpts, e.dialOpts...) - dialOpts = append(dialOpts, spec.dialOpts...) - conn, err := grpc.NewClient(spec.Addr(), dialOpts...) + conn, err := grpc.NewClient(spec.Addr(), spec.dialOpts...) 
if err != nil { return nil, errors.Wrap(err, "dialing connection") } diff --git a/vendor/github.com/thanos-io/thanos/pkg/store/bucket.go b/vendor/github.com/thanos-io/thanos/pkg/store/bucket.go index 16e9e8c39de..6e5a656e629 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/store/bucket.go +++ b/vendor/github.com/thanos-io/thanos/pkg/store/bucket.go @@ -442,6 +442,7 @@ type BucketStore struct { enableChunkHashCalculation bool enabledLazyExpandedPostings bool + seriesMatchRatio float64 postingGroupMaxKeySeriesRatio float64 sortingStrategy sortingStrategy @@ -591,6 +592,15 @@ func WithPostingGroupMaxKeySeriesRatio(postingGroupMaxKeySeriesRatio float64) Bu } } +// WithSeriesMatchRatio configures how many series would match when intersecting posting groups. +// This is used for lazy posting optimization strategy. Ratio should be within (0, 1). +// The closer to 1, it means matchers have bad selectivity. +func WithSeriesMatchRatio(seriesMatchRatio float64) BucketStoreOption { + return func(s *BucketStore) { + s.seriesMatchRatio = seriesMatchRatio + } +} + // WithDontResort disables series resorting in Store Gateway. func WithDontResort(true bool) BucketStoreOption { return func(s *BucketStore) { @@ -1065,6 +1075,7 @@ type blockSeriesClient struct { bytesLimiter BytesLimiter lazyExpandedPostingEnabled bool + seriesMatchRatio float64 // Mark posting group as lazy if it adds too many keys. 0 to disable. postingGroupMaxKeySeriesRatio float64 lazyExpandedPostingsCount prometheus.Counter @@ -1111,6 +1122,7 @@ func newBlockSeriesClient( chunkFetchDurationSum *prometheus.HistogramVec, extLsetToRemove map[string]struct{}, lazyExpandedPostingEnabled bool, + seriesMatchRatio float64, postingGroupMaxKeySeriesRatio float64, lazyExpandedPostingsCount prometheus.Counter, lazyExpandedPostingByReason *prometheus.CounterVec, @@ -1148,6 +1160,7 @@ func newBlockSeriesClient( chunkFetchDurationSum: chunkFetchDurationSum, lazyExpandedPostingEnabled: lazyExpandedPostingEnabled, + seriesMatchRatio: seriesMatchRatio, postingGroupMaxKeySeriesRatio: postingGroupMaxKeySeriesRatio, lazyExpandedPostingsCount: lazyExpandedPostingsCount, lazyExpandedPostingGroupByReason: lazyExpandedPostingByReason, @@ -1202,7 +1215,7 @@ func (b *blockSeriesClient) ExpandPostings( matchers sortedMatchers, seriesLimiter SeriesLimiter, ) error { - ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter, b.lazyExpandedPostingEnabled, b.postingGroupMaxKeySeriesRatio, b.lazyExpandedPostingSizeBytes, b.lazyExpandedPostingGroupByReason, b.tenant) + ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter, b.lazyExpandedPostingEnabled, b.seriesMatchRatio, b.postingGroupMaxKeySeriesRatio, b.lazyExpandedPostingSizeBytes, b.lazyExpandedPostingGroupByReason, b.tenant) if err != nil { return errors.Wrap(err, "expanded matching posting") } @@ -1635,6 +1648,7 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store s.metrics.chunkFetchDurationSum, extLsetToRemove, s.enabledLazyExpandedPostings, + s.seriesMatchRatio, s.postingGroupMaxKeySeriesRatio, s.metrics.lazyExpandedPostingsCount, s.metrics.lazyExpandedPostingGroupsByReason, @@ -1951,6 +1965,7 @@ func (s *BucketStore) LabelNames(ctx context.Context, req *storepb.LabelNamesReq nil, extLsetToRemove, s.enabledLazyExpandedPostings, + s.seriesMatchRatio, s.postingGroupMaxKeySeriesRatio, s.metrics.lazyExpandedPostingsCount, s.metrics.lazyExpandedPostingGroupsByReason, @@ -2179,6 +2194,7 @@ func (s *BucketStore) LabelValues(ctx 
context.Context, req *storepb.LabelValuesR nil, nil, s.enabledLazyExpandedPostings, + s.seriesMatchRatio, s.postingGroupMaxKeySeriesRatio, s.metrics.lazyExpandedPostingsCount, s.metrics.lazyExpandedPostingGroupsByReason, @@ -2647,6 +2663,7 @@ func (r *bucketIndexReader) ExpandedPostings( ms sortedMatchers, bytesLimiter BytesLimiter, lazyExpandedPostingEnabled bool, + seriesMatchRatio float64, postingGroupMaxKeySeriesRatio float64, lazyExpandedPostingSizeBytes prometheus.Counter, lazyExpandedPostingGroupsByReason *prometheus.CounterVec, @@ -2703,7 +2720,7 @@ func (r *bucketIndexReader) ExpandedPostings( postingGroups = append(postingGroups, newPostingGroup(true, name, []string{value}, nil)) } - ps, err := fetchLazyExpandedPostings(ctx, postingGroups, r, bytesLimiter, addAllPostings, lazyExpandedPostingEnabled, postingGroupMaxKeySeriesRatio, lazyExpandedPostingSizeBytes, lazyExpandedPostingGroupsByReason, tenant) + ps, err := fetchLazyExpandedPostings(ctx, postingGroups, r, bytesLimiter, addAllPostings, lazyExpandedPostingEnabled, seriesMatchRatio, postingGroupMaxKeySeriesRatio, lazyExpandedPostingSizeBytes, lazyExpandedPostingGroupsByReason, tenant) if err != nil { return nil, errors.Wrap(err, "fetch and expand postings") } diff --git a/vendor/github.com/thanos-io/thanos/pkg/store/lazy_postings.go b/vendor/github.com/thanos-io/thanos/pkg/store/lazy_postings.go index 57b48cc342b..325e92e9040 100644 --- a/vendor/github.com/thanos-io/thanos/pkg/store/lazy_postings.go +++ b/vendor/github.com/thanos-io/thanos/pkg/store/lazy_postings.go @@ -214,6 +214,7 @@ func fetchLazyExpandedPostings( bytesLimiter BytesLimiter, addAllPostings bool, lazyExpandedPostingEnabled bool, + seriesMatchRatio float64, postingGroupMaxKeySeriesRatio float64, lazyExpandedPostingSizeBytes prometheus.Counter, lazyExpandedPostingGroupsByReason *prometheus.CounterVec, @@ -237,7 +238,7 @@ func fetchLazyExpandedPostings( r, postingGroups, int64(r.block.estimatedMaxSeriesSize), - 0.5, // TODO(yeya24): Expose this as a flag. 
+ seriesMatchRatio, postingGroupMaxKeySeriesRatio, lazyExpandedPostingSizeBytes, lazyExpandedPostingGroupsByReason, diff --git a/vendor/modules.txt b/vendor/modules.txt index 0ebdd814960..a8d2eac2f3e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -988,7 +988,7 @@ github.com/thanos-io/promql-engine/query github.com/thanos-io/promql-engine/ringbuffer github.com/thanos-io/promql-engine/storage github.com/thanos-io/promql-engine/storage/prometheus -# github.com/thanos-io/thanos v0.37.3-0.20250110074750-4ba0ba403896 +# github.com/thanos-io/thanos v0.37.3-0.20250115144759-236777732278 ## explicit; go 1.23.0 github.com/thanos-io/thanos/pkg/api/query/querypb github.com/thanos-io/thanos/pkg/block From 3666e70e5bf159a149a0223e186ea6fb5d8ad844 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Thu, 16 Jan 2025 15:05:10 -0800 Subject: [PATCH 10/34] Fix race that can cause nil reference when using expanded postings (#6518) Signed-off-by: alanprot Signed-off-by: Alex Le --- pkg/ingester/ingester.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 2502ef8c762..bf921243af4 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -2808,6 +2808,9 @@ func (i *Ingester) expirePostingsCache(ctx context.Context) error { return nil } userDB := i.getTSDB(userID) + if userDB == nil || userDB.postingCache == nil { + continue + } userDB.postingCache.PurgeExpiredItems() } From 22d231ceb1f338a29aa09246a832e944d5996bef Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Sun, 19 Jan 2025 06:30:00 +0900 Subject: [PATCH 11/34] Add more op label values to cortex_query_frontend_queries_total metric (#6519) Signed-off-by: Alex Le --- CHANGELOG.md | 1 + pkg/querier/tripperware/roundtrip.go | 39 +++++++++++++--- pkg/querier/tripperware/roundtrip_test.go | 54 ++++++++++++++++++++++- 3 files changed, 87 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4820d90b11..586b3e2539e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ * [FEATURE] Query Frontend: Support a metadata federated query when `-tenant-federation.enabled=true`. #6461 * [FEATURE] Query Frontend: Support an exemplar federated query when `-tenant-federation.enabled=true`. #6455 * [FEATURE] Ingester/StoreGateway: Add support for cache regex query matchers via `-ingester.matchers-cache-max-items` and `-blocks-storage.bucket-store.matchers-cache-max-items`. #6477 #6491 +* [ENHANCEMENT] Query Frontend: Add more operation label values to the `cortex_query_frontend_queries_total` metric. #6519 * [ENHANCEMENT] Query Frontend: Add a `source` label to query stat metrics. #6470 * [ENHANCEMENT] Query Frontend: Add a flag `-tenant-federation.max-tenant` to limit the number of tenants for federated query. #6493 * [ENHANCEMENT] Querier: Add a `-tenant-federation.max-concurrent` flags to configure the number of worker processing federated query and add a `cortex_querier_federated_tenants_per_query` histogram to track the number of tenants per query. 
#6449 diff --git a/pkg/querier/tripperware/roundtrip.go b/pkg/querier/tripperware/roundtrip.go index 38945b96691..1c26d8b304e 100644 --- a/pkg/querier/tripperware/roundtrip.go +++ b/pkg/querier/tripperware/roundtrip.go @@ -37,6 +37,17 @@ import ( util_log "github.com/cortexproject/cortex/pkg/util/log" ) +const ( + opTypeQuery = "query" + opTypeQueryRange = "query_range" + opTypeSeries = "series" + opTypeRemoteRead = "remote_read" + opTypeLabelNames = "label_names" + opTypeLabelValues = "label_values" + opTypeMetadata = "metadata" + opTypeQueryExemplars = "query_exemplars" +) + // HandlerFunc is like http.HandlerFunc, but for Handler. type HandlerFunc func(context.Context, Request) (Response, error) @@ -140,12 +151,28 @@ func NewQueryTripperware( isQuery := strings.HasSuffix(r.URL.Path, "/query") isQueryRange := strings.HasSuffix(r.URL.Path, "/query_range") isSeries := strings.HasSuffix(r.URL.Path, "/series") - - op := "query" - if isQueryRange { - op = "query_range" - } else if isSeries { - op = "series" + isRemoteRead := strings.HasSuffix(r.URL.Path, "/read") + isLabelNames := strings.HasSuffix(r.URL.Path, "/labels") + isLabelValues := strings.HasSuffix(r.URL.Path, "/values") + isMetadata := strings.HasSuffix(r.URL.Path, "/metadata") + isQueryExemplars := strings.HasSuffix(r.URL.Path, "/query_exemplars") + + op := opTypeQuery + switch { + case isQueryRange: + op = opTypeQueryRange + case isSeries: + op = opTypeSeries + case isRemoteRead: + op = opTypeRemoteRead + case isLabelNames: + op = opTypeLabelNames + case isLabelValues: + op = opTypeLabelValues + case isMetadata: + op = opTypeMetadata + case isQueryExemplars: + op = opTypeQueryExemplars } tenantIDs, err := tenant.TenantIDs(r.Context()) diff --git a/pkg/querier/tripperware/roundtrip_test.go b/pkg/querier/tripperware/roundtrip_test.go index 12aa5f12406..1456d84bbbf 100644 --- a/pkg/querier/tripperware/roundtrip_test.go +++ b/pkg/querier/tripperware/roundtrip_test.go @@ -34,6 +34,10 @@ const ( querySubqueryStepSizeTooSmall = "/api/v1/query?query=up%5B30d%3A%5D" queryExceedsMaxQueryLength = "/api/v1/query?query=up%5B90d%5D" seriesQuery = "/api/v1/series?match[]" + remoteReadQuery = "/api/v1/read" + labelNamesQuery = "/api/v1/labels" + labelValuesQuery = "/api/v1/label/label/values" + metadataQuery = "/api/v1/metadata" responseBody = `{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"foo":"bar"},"values":[[1536673680,"137"],[1536673780,"137"]]}]}}` instantResponseBody = `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"foo":"bar"},"values":[[1536673680,"137"],[1536673780,"137"]]}]}}` @@ -153,7 +157,7 @@ cortex_query_frontend_queries_total{op="query", source="api", user="1"} 1 expectedMetric: ` # HELP cortex_query_frontend_queries_total Total queries sent per tenant. # TYPE cortex_query_frontend_queries_total counter -cortex_query_frontend_queries_total{op="query", source="api", user="1"} 1 +cortex_query_frontend_queries_total{op="query_exemplars", source="api", user="1"} 1 `, }, { @@ -166,6 +170,54 @@ cortex_query_frontend_queries_total{op="query", source="api", user="1"} 1 # HELP cortex_query_frontend_queries_total Total queries sent per tenant. 
# TYPE cortex_query_frontend_queries_total counter cortex_query_frontend_queries_total{op="series", source="api", user="1"} 1 +`, + }, + { + path: labelNamesQuery, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, + userAgent: "dummyUserAgent/1.2", + expectedMetric: ` +# HELP cortex_query_frontend_queries_total Total queries sent per tenant. +# TYPE cortex_query_frontend_queries_total counter +cortex_query_frontend_queries_total{op="label_names", source="api", user="1"} 1 +`, + }, + { + path: labelValuesQuery, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, + userAgent: "dummyUserAgent/1.2", + expectedMetric: ` +# HELP cortex_query_frontend_queries_total Total queries sent per tenant. +# TYPE cortex_query_frontend_queries_total counter +cortex_query_frontend_queries_total{op="label_values", source="api", user="1"} 1 +`, + }, + { + path: metadataQuery, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, + userAgent: "dummyUserAgent/1.2", + expectedMetric: ` +# HELP cortex_query_frontend_queries_total Total queries sent per tenant. +# TYPE cortex_query_frontend_queries_total counter +cortex_query_frontend_queries_total{op="metadata", source="api", user="1"} 1 +`, + }, + { + path: remoteReadQuery, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, + userAgent: "dummyUserAgent/1.2", + expectedMetric: ` +# HELP cortex_query_frontend_queries_total Total queries sent per tenant. +# TYPE cortex_query_frontend_queries_total counter +cortex_query_frontend_queries_total{op="remote_read", source="api", user="1"} 1 `, }, { From c0e64d5e1dc3cec6e4669109ee293289c3c329c0 Mon Sep 17 00:00:00 2001 From: Sam McBroom <86423878+sam-mcbr@users.noreply.github.com> Date: Sun, 19 Jan 2025 13:36:27 -0800 Subject: [PATCH 12/34] Allow use of non-dualstack endpoints for S3 blocks storage (#6522) Signed-off-by: Alex Le --- CHANGELOG.md | 1 + docs/blocks-storage/querier.md | 4 ++++ docs/blocks-storage/store-gateway.md | 4 ++++ docs/configuration/config-file-reference.md | 16 ++++++++++++++++ pkg/storage/bucket/s3/bucket_client.go | 17 +++++++++-------- pkg/storage/bucket/s3/config.go | 2 ++ pkg/storage/bucket/s3/config_test.go | 2 ++ 7 files changed, 38 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 586b3e2539e..8d50c3cacfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ * [ENHANCEMENT] Distributor: Added `cortex_distributor_received_samples_per_labelset_total` metric to calculate ingestion rate per label set. #6443 * [ENHANCEMENT] Added metric name in limiter per-metric exceeded errors. #6416 * [ENHANCEMENT] StoreGateway: Added `cortex_bucket_store_indexheader_load_duration_seconds` and `cortex_bucket_store_indexheader_download_duration_seconds` metrics for time of downloading and loading index header files. #6445 +* [ENHANCEMENT] Blocks Storage: Allow use of non-dualstack endpoints for S3 blocks storage via `-blocks-storage.s3.disable-dualstack`. #6522 * [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224 * [BUGFIX] Ruler: Allow rule evaluation to complete during shutdown. #6326 * [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled. 
#6271 diff --git a/docs/blocks-storage/querier.md b/docs/blocks-storage/querier.md index 05a5bbdd6da..19317be05ff 100644 --- a/docs/blocks-storage/querier.md +++ b/docs/blocks-storage/querier.md @@ -286,6 +286,10 @@ blocks_storage: # CLI flag: -blocks-storage.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -blocks-storage.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -blocks-storage.s3.secret-access-key [secret_access_key: | default = ""] diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md index e40abbb31c0..e7a65dd58c1 100644 --- a/docs/blocks-storage/store-gateway.md +++ b/docs/blocks-storage/store-gateway.md @@ -390,6 +390,10 @@ blocks_storage: # CLI flag: -blocks-storage.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -blocks-storage.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -blocks-storage.s3.secret-access-key [secret_access_key: | default = ""] diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 28154b7b180..0b977b3aff7 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -541,6 +541,10 @@ s3: # CLI flag: -alertmanager-storage.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -alertmanager-storage.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -alertmanager-storage.s3.secret-access-key [secret_access_key: | default = ""] @@ -836,6 +840,10 @@ s3: # CLI flag: -blocks-storage.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -blocks-storage.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -blocks-storage.s3.secret-access-key [secret_access_key: | default = ""] @@ -4771,6 +4779,10 @@ s3: # CLI flag: -ruler-storage.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -ruler-storage.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -ruler-storage.s3.secret-access-key [secret_access_key: | default = ""] @@ -5074,6 +5086,10 @@ s3: # CLI flag: -runtime-config.s3.bucket-name [bucket_name: | default = ""] + # If enabled, S3 endpoint will use the non-dualstack variant. 
+ # CLI flag: -runtime-config.s3.disable-dualstack + [disable_dualstack: | default = false] + # S3 secret access key # CLI flag: -runtime-config.s3.secret-access-key [secret_access_key: | default = ""] diff --git a/pkg/storage/bucket/s3/bucket_client.go b/pkg/storage/bucket/s3/bucket_client.go index 53a0f4f5882..220afb90256 100644 --- a/pkg/storage/bucket/s3/bucket_client.go +++ b/pkg/storage/bucket/s3/bucket_client.go @@ -83,14 +83,15 @@ func newS3Config(cfg Config) (s3.Config, error) { } return s3.Config{ - Bucket: cfg.BucketName, - Endpoint: cfg.Endpoint, - Region: cfg.Region, - AccessKey: cfg.AccessKeyID, - SecretKey: cfg.SecretAccessKey.Value, - Insecure: cfg.Insecure, - SSEConfig: sseCfg, - SendContentMd5: cfg.SendContentMd5, + Bucket: cfg.BucketName, + Endpoint: cfg.Endpoint, + Region: cfg.Region, + DisableDualstack: cfg.DisableDualstack, + AccessKey: cfg.AccessKeyID, + SecretKey: cfg.SecretAccessKey.Value, + Insecure: cfg.Insecure, + SSEConfig: sseCfg, + SendContentMd5: cfg.SendContentMd5, HTTPConfig: s3.HTTPConfig{ IdleConnTimeout: model.Duration(cfg.HTTP.IdleConnTimeout), ResponseHeaderTimeout: model.Duration(cfg.HTTP.ResponseHeaderTimeout), diff --git a/pkg/storage/bucket/s3/config.go b/pkg/storage/bucket/s3/config.go index bb7bb9f9f86..df5bd33ab29 100644 --- a/pkg/storage/bucket/s3/config.go +++ b/pkg/storage/bucket/s3/config.go @@ -66,6 +66,7 @@ type Config struct { Endpoint string `yaml:"endpoint"` Region string `yaml:"region"` BucketName string `yaml:"bucket_name"` + DisableDualstack bool `yaml:"disable_dualstack"` SecretAccessKey flagext.Secret `yaml:"secret_access_key"` AccessKeyID string `yaml:"access_key_id"` Insecure bool `yaml:"insecure"` @@ -89,6 +90,7 @@ func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.Var(&cfg.SecretAccessKey, prefix+"s3.secret-access-key", "S3 secret access key") f.StringVar(&cfg.BucketName, prefix+"s3.bucket-name", "", "S3 bucket name") f.StringVar(&cfg.Region, prefix+"s3.region", "", "S3 region. If unset, the client will issue a S3 GetBucketLocation API call to autodetect it.") + f.BoolVar(&cfg.DisableDualstack, prefix+"s3.disable-dualstack", false, "If enabled, S3 endpoint will use the non-dualstack variant.") f.StringVar(&cfg.Endpoint, prefix+"s3.endpoint", "", "The S3 bucket endpoint. It could be an AWS S3 endpoint listed at https://docs.aws.amazon.com/general/latest/gr/s3.html or the address of an S3-compatible service in hostname:port format.") f.BoolVar(&cfg.Insecure, prefix+"s3.insecure", false, "If enabled, use http:// for the S3 endpoint instead of https://. This could be useful in local dev/test environments while using an S3-compatible backend storage, like Minio.") f.StringVar(&cfg.SignatureVersion, prefix+"s3.signature-version", SignatureVersionV4, fmt.Sprintf("The signature version to use for authenticating against S3. 
Supported values are: %s.", strings.Join(supportedSignatureVersions, ", "))) diff --git a/pkg/storage/bucket/s3/config_test.go b/pkg/storage/bucket/s3/config_test.go index b1f38ce6f46..a01a8a07b7e 100644 --- a/pkg/storage/bucket/s3/config_test.go +++ b/pkg/storage/bucket/s3/config_test.go @@ -51,6 +51,7 @@ func TestConfig(t *testing.T) { endpoint: test-endpoint region: test-region bucket_name: test-bucket-name +disable_dualstack: true secret_access_key: test-secret-access-key access_key_id: test-access-key-id insecure: true @@ -74,6 +75,7 @@ http: Endpoint: "test-endpoint", Region: "test-region", BucketName: "test-bucket-name", + DisableDualstack: true, SecretAccessKey: flagext.Secret{Value: "test-secret-access-key"}, AccessKeyID: "test-access-key-id", Insecure: true, From a64382d4633e1702c117a7af1ca2998b88e88bbe Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 20 Jan 2025 14:02:32 -0800 Subject: [PATCH 13/34] Expose grpc client connect timeout config and default to 5s (#6523) * expose grpc client connect timeout config Signed-off-by: yeya24 * changelog Signed-off-by: yeya24 --------- Signed-off-by: yeya24 Signed-off-by: Alex Le --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 30 +++++++++++++++++++++ pkg/util/grpcclient/grpcclient.go | 14 ++++++++++ 3 files changed, 45 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d50c3cacfe..c3699d14bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * [CHANGE] Change default value of `-blocks-storage.bucket-store.index-cache.multilevel.max-async-concurrency` from `50` to `3` #6265 * [CHANGE] Enable Compactor and Alertmanager in target all. #6204 * [CHANGE] Update the `cortex_ingester_inflight_push_requests` metric to represent the maximum number of inflight requests recorded in the last minute. #6437 +* [CHANGE] gRPC Client: Expose connection timeout and set default to value to 5s. #6523 * [FEATURE] Ruler: Add an experimental flag `-ruler.query-response-format` to retrieve query response as a proto format. #6345 * [FEATURE] Ruler: Pagination support for List Rules API. #6299 * [FEATURE] Query Frontend/Querier: Add protobuf codec `-api.querier-default-codec` and the option to choose response compression type `-querier.response-compression`. #5527 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 0b977b3aff7..69d6713e930 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -266,6 +266,11 @@ query_scheduler: # CLI flag: -query-scheduler.grpc-client-config.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. + # CLI flag: -query-scheduler.grpc-client-config.connect-timeout + [connect_timeout: | default = 5s] + # The tracing_config configures backends cortex uses. [tracing: ] ``` @@ -2972,6 +2977,11 @@ grpc_client_config: # Skip validating server certificate. # CLI flag: -querier.frontend-client.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. 
+ # CLI flag: -querier.frontend-client.connect-timeout + [connect_timeout: | default = 5s] ``` ### `ingester_config` @@ -3282,6 +3292,11 @@ grpc_client_config: # CLI flag: -ingester.client.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. + # CLI flag: -ingester.client.connect-timeout + [connect_timeout: | default = 5s] + # EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target # and fail the request if the target is marked as unhealthy. healthcheck_config: @@ -4200,6 +4215,11 @@ grpc_client_config: # CLI flag: -frontend.grpc-client-config.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. + # CLI flag: -frontend.grpc-client-config.connect-timeout + [connect_timeout: | default = 5s] + # When multiple query-schedulers are available, re-enqueue queries that were # rejected due to too many outstanding requests. # CLI flag: -frontend.retry-on-too-many-outstanding-requests @@ -4426,6 +4446,11 @@ frontend_client: # CLI flag: -ruler.frontendClient.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. + # CLI flag: -ruler.frontendClient.connect-timeout + [connect_timeout: | default = 5s] + # URL of alerts return path. # CLI flag: -ruler.external.url [external_url: | default = ] @@ -4501,6 +4526,11 @@ ruler_client: # CLI flag: -ruler.client.tls-insecure-skip-verify [tls_insecure_skip_verify: | default = false] + # The maximum amount of time to establish a connection. A value of 0 means + # using default gRPC client connect timeout 20s. + # CLI flag: -ruler.client.connect-timeout + [connect_timeout: | default = 5s] + # Timeout for downstream rulers. # CLI flag: -ruler.client.remote-timeout [remote_timeout: | default = 2m] diff --git a/pkg/util/grpcclient/grpcclient.go b/pkg/util/grpcclient/grpcclient.go index 9816b7ab6c6..6adb8139a2c 100644 --- a/pkg/util/grpcclient/grpcclient.go +++ b/pkg/util/grpcclient/grpcclient.go @@ -8,6 +8,7 @@ import ( middleware "github.com/grpc-ecosystem/go-grpc-middleware" "github.com/pkg/errors" "google.golang.org/grpc" + grpcbackoff "google.golang.org/grpc/backoff" "google.golang.org/grpc/encoding/gzip" "google.golang.org/grpc/keepalive" @@ -32,6 +33,8 @@ type Config struct { TLSEnabled bool `yaml:"tls_enabled"` TLS tls.ClientConfig `yaml:",inline"` SignWriteRequestsEnabled bool `yaml:"-"` + + ConnectTimeout time.Duration `yaml:"connect_timeout"` } type ConfigWithHealthCheck struct { @@ -58,6 +61,7 @@ func (cfg *Config) RegisterFlagsWithPrefix(prefix, defaultGrpcCompression string f.IntVar(&cfg.RateLimitBurst, prefix+".grpc-client-rate-limit-burst", 0, "Rate limit burst for gRPC client.") f.BoolVar(&cfg.BackoffOnRatelimits, prefix+".backoff-on-ratelimits", false, "Enable backoff and retry when we hit ratelimits.") f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS in the GRPC client. This flag needs to be enabled when any other TLS flag is set. If set to false, insecure connection to gRPC server will be used.") + f.DurationVar(&cfg.ConnectTimeout, prefix+".connect-timeout", 5*time.Second, "The maximum amount of time to establish a connection. 
A value of 0 means using default gRPC client connect timeout 20s.") cfg.BackoffConfig.RegisterFlagsWithPrefix(prefix, f) @@ -111,6 +115,16 @@ func (cfg *Config) DialOption(unaryClientInterceptors []grpc.UnaryClientIntercep unaryClientInterceptors = append([]grpc.UnaryClientInterceptor{NewRateLimiter(cfg)}, unaryClientInterceptors...) } + if cfg.ConnectTimeout > 0 { + opts = append( + opts, + grpc.WithConnectParams(grpc.ConnectParams{ + Backoff: grpcbackoff.DefaultConfig, + MinConnectTimeout: cfg.ConnectTimeout, + }), + ) + } + if cfg.SignWriteRequestsEnabled { unaryClientInterceptors = append(unaryClientInterceptors, UnarySigningClientInterceptor) } From 12e8808aabb21ab46ae8ee0b159d5945e6d51765 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Tue, 21 Jan 2025 15:21:40 -0800 Subject: [PATCH 14/34] Hook up partition compaction end to end implementation (#6510) * Implemented partition compaction end to end with custom compaction lifecycle Signed-off-by: Alex Le * removed unused variable Signed-off-by: Alex Le * tweak test Signed-off-by: Alex Le * tweak test Signed-off-by: Alex Le * refactor according to comments Signed-off-by: Alex Le * tweak test Signed-off-by: Alex Le * check context error inside sharded posting Signed-off-by: Alex Le * fix lint Signed-off-by: Alex Le * fix integration test for memberlist Signed-off-by: Alex Le * make compactor initial wait cancellable Signed-off-by: Alex Le --------- Signed-off-by: Alex Le --- pkg/compactor/background_chunks_series_set.go | 60 + pkg/compactor/blocks_cleaner.go | 10 +- pkg/compactor/compactor.go | 161 +- pkg/compactor/compactor_metrics.go | 7 + pkg/compactor/compactor_metrics_test.go | 10 +- pkg/compactor/compactor_paritioning_test.go | 1807 +++++++++++++++++ pkg/compactor/compactor_test.go | 22 +- .../partition_compaction_complete_checker.go | 16 + pkg/compactor/sharded_block_populator.go | 208 ++ .../sharded_compaction_lifecycle_callback.go | 108 + ...rded_compaction_lifecycle_callback_test.go | 96 + pkg/compactor/sharded_posting.go | 38 + pkg/compactor/sharded_posting_test.go | 109 + .../thanos/pkg/testutil/e2eutil/copy.go | 55 + .../thanos/pkg/testutil/e2eutil/port.go | 20 + .../thanos/pkg/testutil/e2eutil/prometheus.go | 818 ++++++++ .../thanos/pkg/testutil/e2eutil/rand.go | 11 + .../pkg/testutil/e2eutil/sysprocattr.go | 13 + .../pkg/testutil/e2eutil/sysprocattr_linux.go | 13 + vendor/modules.txt | 1 + 20 files changed, 3537 insertions(+), 46 deletions(-) create mode 100644 pkg/compactor/background_chunks_series_set.go create mode 100644 pkg/compactor/compactor_paritioning_test.go create mode 100644 pkg/compactor/partition_compaction_complete_checker.go create mode 100644 pkg/compactor/sharded_block_populator.go create mode 100644 pkg/compactor/sharded_compaction_lifecycle_callback.go create mode 100644 pkg/compactor/sharded_compaction_lifecycle_callback_test.go create mode 100644 pkg/compactor/sharded_posting.go create mode 100644 pkg/compactor/sharded_posting_test.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/copy.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/port.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/prometheus.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/rand.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr.go create mode 100644 vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr_linux.go diff --git 
a/pkg/compactor/background_chunks_series_set.go b/pkg/compactor/background_chunks_series_set.go new file mode 100644 index 00000000000..bca40f73d2e --- /dev/null +++ b/pkg/compactor/background_chunks_series_set.go @@ -0,0 +1,60 @@ +package compactor + +import ( + "context" + + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/util/annotations" +) + +type backgrounChunkSeriesSet struct { + nextSet chan storage.ChunkSeries + actual storage.ChunkSeries + cs storage.ChunkSeriesSet +} + +func (b *backgrounChunkSeriesSet) Next() bool { + s, ok := <-b.nextSet + b.actual = s + return ok +} + +func (b *backgrounChunkSeriesSet) At() storage.ChunkSeries { + return b.actual +} + +func (b *backgrounChunkSeriesSet) Err() error { + return b.cs.Err() +} + +func (b *backgrounChunkSeriesSet) Warnings() annotations.Annotations { + return b.cs.Warnings() +} + +func (b *backgrounChunkSeriesSet) run(ctx context.Context) { + for { + if !b.cs.Next() { + close(b.nextSet) + return + } + + select { + case b.nextSet <- b.cs.At(): + case <-ctx.Done(): + return + } + } +} + +func NewBackgroundChunkSeriesSet(ctx context.Context, cs storage.ChunkSeriesSet) storage.ChunkSeriesSet { + r := &backgrounChunkSeriesSet{ + cs: cs, + nextSet: make(chan storage.ChunkSeries, 1000), + } + + go func() { + r.run(ctx) + }() + + return r +} diff --git a/pkg/compactor/blocks_cleaner.go b/pkg/compactor/blocks_cleaner.go index 71dfc775dfd..d8970de61a7 100644 --- a/pkg/compactor/blocks_cleaner.go +++ b/pkg/compactor/blocks_cleaner.go @@ -152,23 +152,23 @@ func NewBlocksCleaner( tenantBlocks: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_bucket_blocks_count", Help: "Total number of blocks in the bucket. Includes blocks marked for deletion, but not partial blocks.", - }, []string{"user"}), + }, commonLabels), tenantBlocksMarkedForDelete: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_bucket_blocks_marked_for_deletion_count", Help: "Total number of blocks marked for deletion in the bucket.", - }, []string{"user"}), + }, commonLabels), tenantBlocksMarkedForNoCompaction: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_bucket_blocks_marked_for_no_compaction_count", Help: "Total number of blocks marked for no compaction in the bucket.", - }, []string{"user"}), + }, commonLabels), tenantPartialBlocks: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_bucket_blocks_partials_count", Help: "Total number of partial blocks.", - }, []string{"user"}), + }, commonLabels), tenantBucketIndexLastUpdate: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_bucket_index_last_successful_update_timestamp_seconds", Help: "Timestamp of the last successful update of a tenant's bucket index.", - }, []string{"user"}), + }, commonLabels), tenantBlocksCleanedTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_bucket_blocks_cleaned_total", Help: "Total number of blocks deleted for a tenant.", diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index 01f534f6296..e0ea3b21022 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -160,6 +160,30 @@ var ( } return compactor, plannerFactory, nil } + + DefaultBlockDeletableCheckerFactory = func(_ context.Context, _ objstore.InstrumentedBucket, _ log.Logger) compact.BlockDeletableChecker { + return compact.DefaultBlockDeletableChecker{} + } + + PartitionCompactionBlockDeletableCheckerFactory = func(ctx context.Context, bkt 
objstore.InstrumentedBucket, logger log.Logger) compact.BlockDeletableChecker { + return NewPartitionCompactionBlockDeletableChecker() + } + + DefaultCompactionLifecycleCallbackFactory = func(_ context.Context, _ objstore.InstrumentedBucket, _ log.Logger, _ int, _ string, _ string, _ *compactorMetrics) compact.CompactionLifecycleCallback { + return compact.DefaultCompactionLifecycleCallback{} + } + + ShardedCompactionLifecycleCallbackFactory = func(ctx context.Context, userBucket objstore.InstrumentedBucket, logger log.Logger, metaSyncConcurrency int, compactDir string, userID string, compactorMetrics *compactorMetrics) compact.CompactionLifecycleCallback { + return NewShardedCompactionLifecycleCallback( + ctx, + userBucket, + logger, + metaSyncConcurrency, + compactDir, + userID, + compactorMetrics, + ) + } ) // BlocksGrouperFactory builds and returns the grouper to use to compact a tenant's blocks. @@ -202,6 +226,22 @@ type PlannerFactory func( compactorMetrics *compactorMetrics, ) compact.Planner +type CompactionLifecycleCallbackFactory func( + ctx context.Context, + userBucket objstore.InstrumentedBucket, + logger log.Logger, + metaSyncConcurrency int, + compactDir string, + userID string, + compactorMetrics *compactorMetrics, +) compact.CompactionLifecycleCallback + +type BlockDeletableCheckerFactory func( + ctx context.Context, + bkt objstore.InstrumentedBucket, + logger log.Logger, +) compact.BlockDeletableChecker + // Limits defines limits used by the Compactor. type Limits interface { CompactorTenantShardSize(userID string) int @@ -380,6 +420,10 @@ type Compactor struct { blocksPlannerFactory PlannerFactory + blockDeletableCheckerFactory BlockDeletableCheckerFactory + + compactionLifecycleCallbackFactory CompactionLifecycleCallbackFactory + // Client used to run operations on the bucket storing blocks. 
bucketClient objstore.InstrumentedBucket @@ -436,11 +480,25 @@ func NewCompactor(compactorCfg Config, storageCfg cortex_tsdb.BlocksStorageConfi } } + var blockDeletableCheckerFactory BlockDeletableCheckerFactory + if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle && compactorCfg.CompactionStrategy == util.CompactionStrategyPartitioning { + blockDeletableCheckerFactory = PartitionCompactionBlockDeletableCheckerFactory + } else { + blockDeletableCheckerFactory = DefaultBlockDeletableCheckerFactory + } + + var compactionLifecycleCallbackFactory CompactionLifecycleCallbackFactory + if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle && compactorCfg.CompactionStrategy == util.CompactionStrategyPartitioning { + compactionLifecycleCallbackFactory = ShardedCompactionLifecycleCallbackFactory + } else { + compactionLifecycleCallbackFactory = DefaultCompactionLifecycleCallbackFactory + } + if ingestionReplicationFactor <= 0 { ingestionReplicationFactor = 1 } - cortexCompactor, err := newCompactor(compactorCfg, storageCfg, logger, registerer, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory, limits, ingestionReplicationFactor) + cortexCompactor, err := newCompactor(compactorCfg, storageCfg, logger, registerer, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory, blockDeletableCheckerFactory, compactionLifecycleCallbackFactory, limits, ingestionReplicationFactor) if err != nil { return nil, errors.Wrap(err, "failed to create Cortex blocks compactor") } @@ -456,6 +514,8 @@ func newCompactor( bucketClientFactory func(ctx context.Context) (objstore.InstrumentedBucket, error), blocksGrouperFactory BlocksGrouperFactory, blocksCompactorFactory BlocksCompactorFactory, + blockDeletableCheckerFactory BlockDeletableCheckerFactory, + compactionLifecycleCallbackFactory CompactionLifecycleCallbackFactory, limits *validation.Overrides, ingestionReplicationFactor int, ) (*Compactor, error) { @@ -466,15 +526,17 @@ func newCompactor( compactorMetrics = newDefaultCompactorMetrics(registerer) } c := &Compactor{ - compactorCfg: compactorCfg, - storageCfg: storageCfg, - parentLogger: logger, - logger: log.With(logger, "component", "compactor"), - registerer: registerer, - bucketClientFactory: bucketClientFactory, - blocksGrouperFactory: blocksGrouperFactory, - blocksCompactorFactory: blocksCompactorFactory, - allowedTenants: util.NewAllowedTenants(compactorCfg.EnabledTenants, compactorCfg.DisabledTenants), + compactorCfg: compactorCfg, + storageCfg: storageCfg, + parentLogger: logger, + logger: log.With(logger, "component", "compactor"), + registerer: registerer, + bucketClientFactory: bucketClientFactory, + blocksGrouperFactory: blocksGrouperFactory, + blocksCompactorFactory: blocksCompactorFactory, + blockDeletableCheckerFactory: blockDeletableCheckerFactory, + compactionLifecycleCallbackFactory: compactionLifecycleCallbackFactory, + allowedTenants: util.NewAllowedTenants(compactorCfg.EnabledTenants, compactorCfg.DisabledTenants), CompactorStartDurationSeconds: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{ Name: "cortex_compactor_start_duration_seconds", @@ -662,12 +724,6 @@ func (c *Compactor) starting(ctx context.Context) error { }, c.bucketClient, c.usersScanner, c.compactorCfg.CompactionVisitMarkerTimeout, c.limits, c.parentLogger, cleanerRingLifecyclerID, c.registerer, c.compactorCfg.CleanerVisitMarkerTimeout, c.compactorCfg.CleanerVisitMarkerFileUpdateInterval, c.compactorMetrics.syncerBlocksMarkedForDeletion, 
c.compactorMetrics.remainingPlannedCompactions) - // Ensure an initial cleanup occurred before starting the compactor. - if err := services.StartAndAwaitRunning(ctx, c.blocksCleaner); err != nil { - c.ringSubservices.StopAsync() - return errors.Wrap(err, "failed to start the blocks cleaner") - } - if c.compactorCfg.CachingBucketEnabled { matchers := cortex_tsdb.NewMatchers() // Do not cache tenant deletion marker and block deletion marker for compactor @@ -698,15 +754,30 @@ func (c *Compactor) stopping(_ error) error { } func (c *Compactor) running(ctx context.Context) error { + // Ensure an initial cleanup occurred as first thing when running compactor. + if err := services.StartAndAwaitRunning(ctx, c.blocksCleaner); err != nil { + c.ringSubservices.StopAsync() + return errors.Wrap(err, "failed to start the blocks cleaner") + } + // Run an initial compaction before starting the interval. + // Insert jitter right before compaction starts to avoid multiple starting compactor to be in sync + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(time.Duration(rand.Int63n(int64(float64(c.compactorCfg.CompactionInterval) * 0.1)))): + } c.compactUsers(ctx) - ticker := time.NewTicker(util.DurationWithJitter(c.compactorCfg.CompactionInterval, 0.05)) + ticker := time.NewTicker(c.compactorCfg.CompactionInterval) defer ticker.Stop() for { select { case <-ticker.C: + // Insert jitter right before compaction starts, so that there will always + // have jitter even compaction time is longer than CompactionInterval + time.Sleep(time.Duration(rand.Int63n(int64(float64(c.compactorCfg.CompactionInterval) * 0.1)))) c.compactUsers(ctx) case <-ctx.Done(): return nil @@ -717,23 +788,19 @@ func (c *Compactor) running(ctx context.Context) error { } func (c *Compactor) compactUsers(ctx context.Context) { - failed := false + succeeded := false interrupted := false + compactionErrorCount := 0 c.CompactionRunsStarted.Inc() defer func() { - // interruptions and successful runs are considered - // mutually exclusive but we consider a run failed if any - // tenant runs failed even if later runs are interrupted - if !interrupted && !failed { + if succeeded && compactionErrorCount == 0 { c.CompactionRunsCompleted.Inc() c.CompactionRunsLastSuccess.SetToCurrentTime() - } - if interrupted { + } else if interrupted { c.CompactionRunsInterrupted.Inc() - } - if failed { + } else { c.CompactionRunsFailed.Inc() } @@ -747,7 +814,6 @@ func (c *Compactor) compactUsers(ctx context.Context) { level.Info(c.logger).Log("msg", "discovering users from bucket") users, err := c.discoverUsersWithRetries(ctx) if err != nil { - failed = true level.Error(c.logger).Log("msg", "failed to discover users from bucket", "err", err) return } @@ -816,7 +882,7 @@ func (c *Compactor) compactUsers(ctx context.Context) { } c.CompactionRunFailedTenants.Inc() - failed = true + compactionErrorCount++ level.Error(c.logger).Log("msg", "failed to compact user blocks", "user", userID, "err", err) continue } @@ -851,6 +917,7 @@ func (c *Compactor) compactUsers(ctx context.Context) { } } } + succeeded = true } func (c *Compactor) compactUserWithRetries(ctx context.Context, userID string) error { @@ -885,6 +952,11 @@ func (c *Compactor) compactUserWithRetries(ctx context.Context, userID string) e retries.Wait() } + err := errors.Unwrap(errors.Cause(lastErr)) + if errors.Is(err, plannerCompletedPartitionError) || errors.Is(err, plannerVisitedPartitionError) { + return nil + } + return lastErr } @@ -898,7 +970,12 @@ func (c *Compactor) compactUser(ctx 
context.Context, userID string) error { // Filters out duplicate blocks that can be formed from two or more overlapping // blocks that fully submatches the source blocks of the older blocks. - deduplicateBlocksFilter := block.NewDeduplicateFilter(c.compactorCfg.BlockSyncConcurrency) + var deduplicateBlocksFilter CortexMetadataFilter + if c.compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle && c.compactorCfg.CompactionStrategy == util.CompactionStrategyPartitioning { + deduplicateBlocksFilter = &disabledDeduplicateFilter{} + } else { + deduplicateBlocksFilter = block.NewDeduplicateFilter(c.compactorCfg.BlockSyncConcurrency) + } // While fetching blocks, we filter out blocks that were marked for deletion by using IgnoreDeletionMarkFilter. // No delay is used -- all blocks with deletion marker are ignored, and not considered for compaction. @@ -966,12 +1043,14 @@ func (c *Compactor) compactUser(ctx context.Context, userID string) error { currentCtx, cancel := context.WithCancel(ctx) defer cancel() - compactor, err := compact.NewBucketCompactor( + compactor, err := compact.NewBucketCompactorWithCheckerAndCallback( ulogger, syncer, c.blocksGrouperFactory(currentCtx, c.compactorCfg, bucket, ulogger, c.BlocksMarkedForNoCompaction, c.blockVisitMarkerReadFailed, c.blockVisitMarkerWriteFailed, syncerMetrics, c.compactorMetrics, c.ring, c.ringLifecycler, c.limits, userID, noCompactMarkerFilter, c.ingestionReplicationFactor), c.blocksPlannerFactory(currentCtx, bucket, ulogger, c.compactorCfg, noCompactMarkerFilter, c.ringLifecycler, userID, c.blockVisitMarkerReadFailed, c.blockVisitMarkerWriteFailed, c.compactorMetrics), c.blocksCompactor, + c.blockDeletableCheckerFactory(currentCtx, bucket, ulogger), + c.compactionLifecycleCallbackFactory(currentCtx, bucket, ulogger, c.compactorCfg.MetaSyncConcurrency, c.compactDirForUser(userID), userID, c.compactorMetrics), c.compactDirForUser(userID), bucket, c.compactorCfg.CompactionConcurrency, @@ -982,6 +1061,7 @@ func (c *Compactor) compactUser(ctx context.Context, userID string) error { } if err := compactor.Compact(ctx); err != nil { + level.Warn(ulogger).Log("msg", "compaction failed with error", "err", err) return errors.Wrap(err, "compaction") } @@ -1148,3 +1228,24 @@ func (c *Compactor) isPermissionDeniedErr(err error) bool { } return s.Code() == codes.PermissionDenied } + +type CortexMetadataFilter interface { + block.DeduplicateFilter + block.MetadataFilter +} + +// disabledDeduplicateFilter is only used by Partitioning Compaction. Because Partitioning Compaction +// would always generate multiple result blocks (different partitions) for the same time range compaction. +// Those result blocks would always have same source blocks. Those result blocks should not be marked +// as duplicates when grouping for the next level of compaction. So DeduplicateFilter is disabled. 
+type disabledDeduplicateFilter struct { +} + +func (f *disabledDeduplicateFilter) Filter(ctx context.Context, metas map[ulid.ULID]*metadata.Meta, synced block.GaugeVec, modified block.GaugeVec) error { + // don't do any deduplicate filtering + return nil +} + +func (f *disabledDeduplicateFilter) DuplicateIDs() []ulid.ULID { + return nil +} diff --git a/pkg/compactor/compactor_metrics.go b/pkg/compactor/compactor_metrics.go index 23e7bca6c02..03d4686ee1c 100644 --- a/pkg/compactor/compactor_metrics.go +++ b/pkg/compactor/compactor_metrics.go @@ -40,6 +40,7 @@ type compactorMetrics struct { compactionErrorsCount *prometheus.CounterVec partitionCount *prometheus.GaugeVec compactionsNotPlanned *prometheus.CounterVec + compactionDuration *prometheus.GaugeVec } const ( @@ -179,6 +180,10 @@ func newCompactorMetricsWithLabels(reg prometheus.Registerer, commonLabels []str Name: "cortex_compactor_group_compactions_not_planned_total", Help: "Total number of group compaction not planned due to error.", }, compactionLabels) + m.compactionDuration = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_compact_group_compaction_duration_seconds", + Help: "Duration of completed compactions in seconds", + }, compactionLabels) return &m } @@ -231,6 +236,7 @@ func (m *compactorMetrics) initMetricWithCompactionLabelValues(labelValue ...str m.verticalCompactions.WithLabelValues(labelValue...) m.partitionCount.WithLabelValues(labelValue...) m.compactionsNotPlanned.WithLabelValues(labelValue...) + m.compactionDuration.WithLabelValues(labelValue...) } func (m *compactorMetrics) deleteMetricsForDeletedTenant(userID string) { @@ -243,4 +249,5 @@ func (m *compactorMetrics) deleteMetricsForDeletedTenant(userID string) { m.verticalCompactions.DeleteLabelValues(userID) m.partitionCount.DeleteLabelValues(userID) m.compactionsNotPlanned.DeleteLabelValues(userID) + m.compactionDuration.DeleteLabelValues(userID) } diff --git a/pkg/compactor/compactor_metrics_test.go b/pkg/compactor/compactor_metrics_test.go index 947fd7f396a..b0e212962cc 100644 --- a/pkg/compactor/compactor_metrics_test.go +++ b/pkg/compactor/compactor_metrics_test.go @@ -9,7 +9,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestSyncerMetrics(t *testing.T) { +func TestCompactorMetrics(t *testing.T) { reg := prometheus.NewPedanticRegistry() cm := newCompactorMetricsWithLabels(reg, commonLabels, commonLabels) @@ -140,6 +140,11 @@ func TestSyncerMetrics(t *testing.T) { cortex_compactor_group_compactions_not_planned_total{user="aaa"} 544390 cortex_compactor_group_compactions_not_planned_total{user="bbb"} 555500 cortex_compactor_group_compactions_not_planned_total{user="ccc"} 566610 + # HELP cortex_compact_group_compaction_duration_seconds Duration of completed compactions in seconds + # TYPE cortex_compact_group_compaction_duration_seconds gauge + cortex_compact_group_compaction_duration_seconds{user="aaa"} 577720 + cortex_compact_group_compaction_duration_seconds{user="bbb"} 588830 + cortex_compact_group_compaction_duration_seconds{user="ccc"} 599940 `)) require.NoError(t, err) @@ -199,4 +204,7 @@ func generateTestData(cm *compactorMetrics, base float64) { cm.compactionsNotPlanned.WithLabelValues("aaa").Add(49 * base) cm.compactionsNotPlanned.WithLabelValues("bbb").Add(50 * base) cm.compactionsNotPlanned.WithLabelValues("ccc").Add(51 * base) + cm.compactionDuration.WithLabelValues("aaa").Add(52 * base) + cm.compactionDuration.WithLabelValues("bbb").Add(53 * base) + cm.compactionDuration.WithLabelValues("ccc").Add(54 * base) } 
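For readers following the metrics change above, here is a minimal, self-contained sketch of how a duration gauge like the new `cortex_compact_group_compaction_duration_seconds` could be set around a compaction run. The metric name, help text, and `user` label come from this patch; the `recordCompaction` helper and `runCompaction` callback are hypothetical names used only for illustration, and the real wiring lives in `sharded_compaction_lifecycle_callback.go`, which is not shown in this hunk.

```go
// Illustrative sketch only, not part of this patch. The recordCompaction
// helper and runCompaction callback are hypothetical; they merely demonstrate
// how the gauge added in compactor_metrics.go could be populated.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var compactionDuration = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "cortex_compact_group_compaction_duration_seconds",
	Help: "Duration of completed compactions in seconds",
}, []string{"user"})

// recordCompaction runs the given compaction function and, on success,
// stores how long it took for the given tenant. Because this is a gauge,
// only the duration of the most recently completed compaction is kept.
func recordCompaction(userID string, runCompaction func() error) error {
	start := time.Now()
	if err := runCompaction(); err != nil {
		return err
	}
	compactionDuration.WithLabelValues(userID).Set(time.Since(start).Seconds())
	return nil
}

func main() {
	prometheus.MustRegister(compactionDuration)
	_ = recordCompaction("user-1", func() error {
		// Real compaction work would happen here.
		return nil
	})
}
```

Using a gauge rather than a histogram matches what the patch registers: the per-tenant value reflects only the latest completed compaction and can be removed together with the tenant via `DeleteLabelValues`, as `deleteMetricsForDeletedTenant` does above.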
diff --git a/pkg/compactor/compactor_paritioning_test.go b/pkg/compactor/compactor_paritioning_test.go new file mode 100644 index 00000000000..e3c78bb2c62 --- /dev/null +++ b/pkg/compactor/compactor_paritioning_test.go @@ -0,0 +1,1807 @@ +package compactor + +import ( + "context" + "crypto/rand" + "encoding/json" + "flag" + "fmt" + "io" + "os" + "path" + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "github.com/thanos-io/objstore" + "github.com/thanos-io/thanos/pkg/block" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/compact" + "gopkg.in/yaml.v2" + + "github.com/cortexproject/cortex/pkg/ring" + "github.com/cortexproject/cortex/pkg/ring/kv/consul" + "github.com/cortexproject/cortex/pkg/storage/bucket" + cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" + "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" + cortex_storage_testutil "github.com/cortexproject/cortex/pkg/storage/tsdb/testutil" + "github.com/cortexproject/cortex/pkg/util" + "github.com/cortexproject/cortex/pkg/util/concurrency" + "github.com/cortexproject/cortex/pkg/util/flagext" + "github.com/cortexproject/cortex/pkg/util/services" + cortex_testutil "github.com/cortexproject/cortex/pkg/util/test" + "github.com/cortexproject/cortex/pkg/util/validation" +) + +func TestPartitionConfig_ShouldSupportYamlConfig(t *testing.T) { + yamlCfg := ` +block_ranges: [2h, 48h] +consistency_delay: 1h +block_sync_concurrency: 123 +data_dir: /tmp +compaction_interval: 15m +compaction_retries: 123 +compaction_strategy: partitioning +` + + cfg := Config{} + flagext.DefaultValues(&cfg) + assert.NoError(t, yaml.Unmarshal([]byte(yamlCfg), &cfg)) + assert.Equal(t, cortex_tsdb.DurationList{2 * time.Hour, 48 * time.Hour}, cfg.BlockRanges) + assert.Equal(t, time.Hour, cfg.ConsistencyDelay) + assert.Equal(t, 123, cfg.BlockSyncConcurrency) + assert.Equal(t, "/tmp", cfg.DataDir) + assert.Equal(t, 15*time.Minute, cfg.CompactionInterval) + assert.Equal(t, 123, cfg.CompactionRetries) + assert.Equal(t, util.CompactionStrategyPartitioning, cfg.CompactionStrategy) +} + +func TestPartitionConfig_ShouldSupportCliFlags(t *testing.T) { + fs := flag.NewFlagSet("", flag.PanicOnError) + cfg := Config{} + cfg.RegisterFlags(fs) + require.NoError(t, fs.Parse([]string{ + "-compactor.block-ranges=2h,48h", + "-compactor.consistency-delay=1h", + "-compactor.block-sync-concurrency=123", + "-compactor.data-dir=/tmp", + "-compactor.compaction-interval=15m", + "-compactor.compaction-retries=123", + "-compactor.compaction-strategy=partitioning", + })) + + assert.Equal(t, cortex_tsdb.DurationList{2 * time.Hour, 48 * time.Hour}, cfg.BlockRanges) + assert.Equal(t, time.Hour, cfg.ConsistencyDelay) + assert.Equal(t, 123, cfg.BlockSyncConcurrency) + assert.Equal(t, "/tmp", cfg.DataDir) + assert.Equal(t, 15*time.Minute, cfg.CompactionInterval) + assert.Equal(t, 123, cfg.CompactionRetries) + assert.Equal(t, util.CompactionStrategyPartitioning, cfg.CompactionStrategy) +} + +func TestPartitionConfig_Validate(t *testing.T) { + tests := map[string]struct { + setup 
func(cfg *Config) + initLimits func(*validation.Limits) + expected string + }{ + "should pass with the default config": { + setup: func(cfg *Config) {}, + initLimits: func(_ *validation.Limits) {}, + expected: "", + }, + "should pass with only 1 block range period": { + setup: func(cfg *Config) { + cfg.BlockRanges = cortex_tsdb.DurationList{time.Hour} + }, + initLimits: func(_ *validation.Limits) {}, + expected: "", + }, + "should fail with non divisible block range periods": { + setup: func(cfg *Config) { + cfg.BlockRanges = cortex_tsdb.DurationList{2 * time.Hour, 12 * time.Hour, 24 * time.Hour, 30 * time.Hour} + }, + + initLimits: func(_ *validation.Limits) {}, + expected: errors.Errorf(errInvalidBlockRanges, 30*time.Hour, 24*time.Hour).Error(), + }, + "should fail with duration values of zero": { + setup: func(cfg *Config) { + cfg.BlockRanges = cortex_tsdb.DurationList{2 * time.Hour, 0, 24 * time.Hour, 30 * time.Hour} + }, + initLimits: func(_ *validation.Limits) {}, + expected: errors.Errorf("compactor block range period cannot be zero").Error(), + }, + "should pass with valid shuffle sharding config": { + setup: func(cfg *Config) { + cfg.ShardingStrategy = util.ShardingStrategyShuffle + cfg.ShardingEnabled = true + }, + initLimits: func(limits *validation.Limits) { + limits.CompactorTenantShardSize = 1 + }, + expected: "", + }, + "should fail with bad compactor tenant shard size": { + setup: func(cfg *Config) { + cfg.ShardingStrategy = util.ShardingStrategyShuffle + cfg.ShardingEnabled = true + }, + initLimits: func(_ *validation.Limits) {}, + expected: errInvalidTenantShardSize.Error(), + }, + "should pass with valid compaction strategy config": { + setup: func(cfg *Config) { + cfg.ShardingEnabled = true + cfg.CompactionStrategy = util.CompactionStrategyPartitioning + }, + initLimits: func(_ *validation.Limits) {}, + expected: "", + }, + "should fail with bad compaction strategy": { + setup: func(cfg *Config) { + cfg.CompactionStrategy = "dummy" + }, + initLimits: func(_ *validation.Limits) {}, + expected: errInvalidCompactionStrategy.Error(), + }, + "should fail with partitioning compaction strategy but sharding disabled": { + setup: func(cfg *Config) { + cfg.ShardingEnabled = false + cfg.CompactionStrategy = util.CompactionStrategyPartitioning + }, + initLimits: func(_ *validation.Limits) {}, + expected: errInvalidCompactionStrategyPartitioning.Error(), + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + cfg := &Config{} + limits := validation.Limits{} + flagext.DefaultValues(cfg, &limits) + testData.setup(cfg) + testData.initLimits(&limits) + + if actualErr := cfg.Validate(limits); testData.expected != "" { + assert.EqualError(t, actualErr, testData.expected) + } else { + assert.NoError(t, actualErr) + } + }) + } +} + +func TestPartitionCompactor_SkipCompactionWhenCmkError(t *testing.T) { + t.Parallel() + userID := "user-1" + + ss := bucketindex.Status{Status: bucketindex.CustomerManagedKeyError, Version: bucketindex.SyncStatusFileVersion} + content, err := json.Marshal(ss) + require.NoError(t, err) + + // No user blocks stored in the bucket. 
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{userID}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockIter(userID+"/", []string{}, nil) + bucketClient.MockIter(userID+"/markers/", nil, nil) + bucketClient.MockGet(userID+"/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockGet(userID+"/bucket-index-sync-status.json", string(content), nil) + bucketClient.MockGet(userID+"/bucket-index.json.gz", "", nil) + bucketClient.MockUpload(userID+"/bucket-index-sync-status.json", nil) + bucketClient.MockUpload(userID+"/bucket-index.json.gz", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath(userID), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath(userID), false, nil) + bucketClient.MockIter(userID+"/"+PartitionedGroupDirectory, nil, nil) + + cfg := prepareConfigForPartitioning() + c, _, _, logs, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + assert.Contains(t, strings.Split(strings.TrimSpace(logs.String()), "\n"), `level=info component=compactor msg="skipping compactUser due CustomerManagedKeyError" user=user-1`) +} + +func TestPartitionCompactor_ShouldDoNothingOnNoUserBlocks(t *testing.T) { + t.Parallel() + + // No user blocks stored in the bucket. + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + cfg := prepareConfigForPartitioning() + c, _, _, logs, registry := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + assert.Equal(t, prom_testutil.ToFloat64(c.CompactionRunInterval), cfg.CompactionInterval.Seconds()) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=info component=compactor msg="discovered users from bucket" users=0`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) + + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 1 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. 
+ cortex_compactor_runs_failed_total 0 + + # TYPE cortex_compactor_block_cleanup_failures_total counter + # HELP cortex_compactor_block_cleanup_failures_total Total number of blocks failed to be deleted. + cortex_compactor_block_cleanup_failures_total 0 + + # HELP cortex_compactor_blocks_cleaned_total Total number of blocks deleted. + # TYPE cortex_compactor_blocks_cleaned_total counter + cortex_compactor_blocks_cleaned_total 0 + + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. + # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 0 + + # TYPE cortex_compactor_block_cleanup_started_total counter + # HELP cortex_compactor_block_cleanup_started_total Total number of blocks cleanup runs started. + cortex_compactor_block_cleanup_started_total{user_status="active"} 1 + cortex_compactor_block_cleanup_started_total{user_status="deleted"} 1 + + # TYPE cortex_compactor_block_cleanup_completed_total counter + # HELP cortex_compactor_block_cleanup_completed_total Total number of blocks cleanup runs successfully completed. + cortex_compactor_block_cleanup_completed_total{user_status="active"} 1 + cortex_compactor_block_cleanup_completed_total{user_status="deleted"} 1 + `), + "cortex_compactor_runs_started_total", + "cortex_compactor_runs_completed_total", + "cortex_compactor_runs_failed_total", + "cortex_compactor_garbage_collected_blocks_total", + "cortex_compactor_block_cleanup_failures_total", + "cortex_compactor_blocks_cleaned_total", + "cortex_compactor_blocks_marked_for_deletion_total", + "cortex_compactor_blocks_marked_for_no_compaction_total", + "cortex_compactor_block_cleanup_started_total", + "cortex_compactor_block_cleanup_completed_total", + "cortex_compactor_block_cleanup_failed_total", + )) +} + +func TestPartitionCompactor_ShouldRetryCompactionOnFailureWhileDiscoveringUsersFromBucket(t *testing.T) { + t.Parallel() + + // Fail to iterate over the bucket while discovering users. + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("__markers__", nil, errors.New("failed to iterate the bucket")) + bucketClient.MockIter("", nil, errors.New("failed to iterate the bucket")) + + c, _, _, logs, registry := prepareForPartitioning(t, prepareConfigForPartitioning(), bucketClient, nil, nil) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until all retry attempts have completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsFailed) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + // Ensure the bucket iteration has been retried the configured number of times. 
+ bucketClient.AssertNumberOfCalls(t, "Iter", 1+3) + + assert.ElementsMatch(t, []string{ + `level=error component=cleaner msg="failed to scan users on startup" err="failed to discover users from bucket: failed to iterate the bucket"`, + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=error component=compactor msg="failed to discover users from bucket" err="failed to iterate the bucket"`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) + + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 0 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. + cortex_compactor_runs_failed_total 1 + + # TYPE cortex_compactor_block_cleanup_failures_total counter + # HELP cortex_compactor_block_cleanup_failures_total Total number of blocks failed to be deleted. + cortex_compactor_block_cleanup_failures_total 0 + + # TYPE cortex_compactor_block_cleanup_failed_total counter + # HELP cortex_compactor_block_cleanup_failed_total Total number of blocks cleanup runs failed. + cortex_compactor_block_cleanup_failed_total{user_status="active"} 1 + cortex_compactor_block_cleanup_failed_total{user_status="deleted"} 1 + + # HELP cortex_compactor_blocks_cleaned_total Total number of blocks deleted. + # TYPE cortex_compactor_blocks_cleaned_total counter + cortex_compactor_blocks_cleaned_total 0 + + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. 
+ # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 0 + `), + "cortex_compactor_runs_started_total", + "cortex_compactor_runs_completed_total", + "cortex_compactor_runs_failed_total", + "cortex_compactor_garbage_collected_blocks_total", + "cortex_compactor_block_cleanup_failures_total", + "cortex_compactor_blocks_cleaned_total", + "cortex_compactor_blocks_marked_for_deletion_total", + "cortex_compactor_blocks_marked_for_no_compaction_total", + "cortex_compactor_block_cleanup_started_total", + "cortex_compactor_block_cleanup_completed_total", + "cortex_compactor_block_cleanup_failed_total", + )) +} + +func TestPartitionCompactor_ShouldIncrementCompactionErrorIfFailedToCompactASingleTenant(t *testing.T) { + t.Parallel() + + userID := "test-user" + partitionedGroupID := getPartitionedGroupID(userID) + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{userID}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockIter(userID+"/", []string{userID + "/01DTVP434PA9VFXSW2JKB3392D/meta.json", userID + "/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json"}, nil) + bucketClient.MockIter(userID+"/markers/", nil, nil) + bucketClient.MockGet(userID+"/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath(userID), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath(userID), false, nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload(userID+"/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", nil) + bucketClient.MockGet(userID+"/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet(userID+"/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json", mockBlockMetaJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet(userID+"/01FN6CDF3PNEWWRY5MPGJPE3EX/no-compact-mark.json", "", nil) + bucketClient.MockGet(userID+"/01FN6CDF3PNEWWRY5MPGJPE3EX/deletion-mark.json", "", nil) + bucketClient.MockGet(userID+"/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload(userID+"/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", nil) + bucketClient.MockGet(userID+"/bucket-index.json.gz", "", nil) + bucketClient.MockUpload(userID+"/bucket-index.json.gz", nil) + bucketClient.MockUpload(userID+"/bucket-index-sync-status.json", nil) + bucketClient.MockGet(userID+"/partitioned-groups/"+partitionedGroupID+".json", "", nil) + bucketClient.MockUpload(userID+"/partitioned-groups/"+partitionedGroupID+".json", nil) + bucketClient.MockIter(userID+"/"+PartitionedGroupDirectory, nil, nil) + + c, _, tsdbPlannerMock, _, registry := prepareForPartitioning(t, prepareConfigForPartitioning(), bucketClient, nil, nil) + tsdbPlannerMock.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, errors.New("Failed to plan")) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until all retry attempts have 
completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsFailed) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 0 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. + cortex_compactor_runs_failed_total 1 + `), + "cortex_compactor_runs_started_total", + "cortex_compactor_runs_completed_total", + "cortex_compactor_runs_failed_total", + )) +} + +func TestPartitionCompactor_ShouldCompactAndRemoveUserFolder(t *testing.T) { + partitionedGroupID1 := getPartitionedGroupID("user-1") + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json", mockBlockMetaJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockGet("user-1/partitioned-groups/"+partitionedGroupID1+".json", "", nil) + bucketClient.MockUpload("user-1/partitioned-groups/"+partitionedGroupID1+".json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + + c, _, tsdbPlanner, _, _ := prepareForPartitioning(t, prepareConfigForPartitioning(), bucketClient, nil, nil) + + // Make sure the user folder is created and is being used + // This will be called during compaction + 
tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + _, err := os.Stat(c.compactDirForUser("user-1")) + require.NoError(t, err) + }).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + _, err := os.Stat(c.compactDirForUser("user-1")) + require.True(t, os.IsNotExist(err)) +} + +func TestPartitionCompactor_ShouldIterateOverUsersAndRunCompaction(t *testing.T) { + t.Parallel() + + partitionedGroupID1 := getPartitionedGroupID("user-1") + partitionedGroupID2 := getPartitionedGroupID("user-2") + + // Mock the bucket to contain two users, each one with one block. + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json"}, nil) + bucketClient.MockIter("user-2/", []string{"user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", "user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockIter("user-2/markers/", nil, nil) + bucketClient.MockGet("user-2/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json", mockBlockMetaJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json", 
mockBlockMetaJSON("01FN3V83ABR9992RF8WRJZ76ZQ"), nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-2/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-2/bucket-index-sync-status.json", "", nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockIter("user-2/markers/", nil, nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-2/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockUpload("user-2/bucket-index-sync-status.json", nil) + bucketClient.MockGet("user-1/partitioned-groups/"+partitionedGroupID1+".json", "", nil) + bucketClient.MockUpload("user-1/partitioned-groups/"+partitionedGroupID1+".json", nil) + bucketClient.MockGet("user-2/partitioned-groups/"+partitionedGroupID2+".json", "", nil) + bucketClient.MockUpload("user-2/partitioned-groups/"+partitionedGroupID2+".json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + bucketClient.MockIter("user-2/"+PartitionedGroupDirectory, nil, nil) + + c, _, tsdbPlanner, logs, registry := prepareForPartitioning(t, prepareConfigForPartitioning(), bucketClient, nil, nil) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + // Ensure a plan has been executed for the blocks of each user. 
+ tsdbPlanner.AssertNumberOfCalls(t, "Plan", 2) + + assert.Len(t, tsdbPlanner.getNoCompactBlocks(), 0) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=info component=compactor msg="discovered users from bucket" users=2`, + `level=info component=compactor msg="starting compaction of user blocks" user=user-2`, + `level=info component=compactor org_id=user-2 msg="start sync of metas"`, + `level=info component=compactor org_id=user-2 msg="start of GC"`, + `level=info component=compactor org_id=user-2 msg="start of compactions"`, + `level=info component=compactor org_id=user-2 msg="compaction iterations done"`, + `level=info component=compactor msg="successfully compacted user blocks" user=user-2`, + `level=info component=compactor msg="starting compaction of user blocks" user=user-1`, + `level=info component=compactor org_id=user-1 msg="start sync of metas"`, + `level=info component=compactor org_id=user-1 msg="start of GC"`, + `level=info component=compactor org_id=user-1 msg="start of compactions"`, + `level=info component=compactor org_id=user-1 msg="compaction iterations done"`, + `level=info component=compactor msg="successfully compacted user blocks" user=user-1`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) + + // Instead of testing for shipper metrics, we only check our metrics here. + // Real shipper metrics are too variable to embed into a test. + testedMetrics := []string{ + "cortex_compactor_runs_started_total", "cortex_compactor_runs_completed_total", "cortex_compactor_runs_failed_total", + "cortex_compactor_blocks_cleaned_total", "cortex_compactor_block_cleanup_failures_total", "cortex_compactor_blocks_marked_for_deletion_total", + "cortex_compactor_block_cleanup_started_total", "cortex_compactor_block_cleanup_completed_total", "cortex_compactor_block_cleanup_failed_total", + "cortex_compactor_blocks_marked_for_no_compaction_total", + } + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 1 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. + cortex_compactor_runs_failed_total 0 + + # TYPE cortex_compactor_block_cleanup_failures_total counter + # HELP cortex_compactor_block_cleanup_failures_total Total number of blocks failed to be deleted. + cortex_compactor_block_cleanup_failures_total 0 + + # HELP cortex_compactor_blocks_cleaned_total Total number of blocks deleted. + # TYPE cortex_compactor_blocks_cleaned_total counter + cortex_compactor_blocks_cleaned_total 0 + + # HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor. 
+ # TYPE cortex_compactor_blocks_marked_for_deletion_total counter + cortex_compactor_blocks_marked_for_deletion_total{reason="compaction",user="user-1"} 0 + cortex_compactor_blocks_marked_for_deletion_total{reason="compaction",user="user-2"} 0 + cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-1"} 0 + cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-2"} 0 + + # TYPE cortex_compactor_block_cleanup_started_total counter + # HELP cortex_compactor_block_cleanup_started_total Total number of blocks cleanup runs started. + cortex_compactor_block_cleanup_started_total{user_status="active"} 1 + cortex_compactor_block_cleanup_started_total{user_status="deleted"} 1 + + # TYPE cortex_compactor_block_cleanup_completed_total counter + # HELP cortex_compactor_block_cleanup_completed_total Total number of blocks cleanup runs successfully completed. + cortex_compactor_block_cleanup_completed_total{user_status="active"} 1 + cortex_compactor_block_cleanup_completed_total{user_status="deleted"} 1 + + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. + # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 0 + `), testedMetrics...)) +} + +func TestPartitionCompactor_ShouldNotCompactBlocksMarkedForDeletion(t *testing.T) { + t.Parallel() + + cfg := prepareConfigForPartitioning() + cfg.DeletionDelay = 10 * time.Minute // Delete block after 10 minutes + + // Mock the bucket to contain two users, each one with one block. + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ"}, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + + // Block that has just been marked for deletion. It will not be deleted just yet, and it also will not be compacted. + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", mockDeletionMarkJSON("01DTVP434PA9VFXSW2JKB3392D", time.Now()), nil) + bucketClient.MockGet("user-1/markers/01DTVP434PA9VFXSW2JKB3392D-deletion-mark.json", mockDeletionMarkJSON("01DTVP434PA9VFXSW2JKB3392D", time.Now()), nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + + // This block will be deleted by cleaner. 
+ bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", mockDeletionMarkJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ", time.Now().Add(-cfg.DeletionDelay)), nil) + bucketClient.MockGet("user-1/markers/01DTW0ZCPDDNV4BV83Q2SV4QAZ-deletion-mark.json", mockDeletionMarkJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ", time.Now().Add(-cfg.DeletionDelay)), nil) + + bucketClient.MockIter("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ", []string{ + "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", + "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", + }, nil) + + bucketClient.MockIter("user-1/markers/", []string{ + "user-1/markers/01DTVP434PA9VFXSW2JKB3392D-deletion-mark.json", + "user-1/markers/01DTW0ZCPDDNV4BV83Q2SV4QAZ-deletion-mark.json", + }, nil) + + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + + bucketClient.MockDelete("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", nil) + bucketClient.MockDelete("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", nil) + bucketClient.MockDelete("user-1/markers/01DTW0ZCPDDNV4BV83Q2SV4QAZ-deletion-mark.json", nil) + bucketClient.MockDelete("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ", nil) + bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + + c, _, tsdbPlanner, logs, registry := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + // Since both blocks are marked for deletion, none of them are going to be compacted. + tsdbPlanner.AssertNumberOfCalls(t, "Plan", 0) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=info component=compactor msg="discovered users from bucket" users=1`, + `level=info component=compactor msg="starting compaction of user blocks" user=user-1`, + `level=info component=compactor org_id=user-1 msg="start sync of metas"`, + `level=info component=compactor org_id=user-1 msg="start of GC"`, + `level=info component=compactor org_id=user-1 msg="start of compactions"`, + `level=info component=compactor org_id=user-1 msg="compaction iterations done"`, + `level=info component=compactor msg="successfully compacted user blocks" user=user-1`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) + + // Instead of testing for shipper metrics, we only check our metrics here. + // Real shipper metrics are too variable to embed into a test. 
+ testedMetrics := []string{ + "cortex_compactor_runs_started_total", "cortex_compactor_runs_completed_total", "cortex_compactor_runs_failed_total", + "cortex_compactor_blocks_cleaned_total", "cortex_compactor_block_cleanup_failures_total", "cortex_compactor_blocks_marked_for_deletion_total", + "cortex_compactor_block_cleanup_started_total", "cortex_compactor_block_cleanup_completed_total", "cortex_compactor_block_cleanup_failed_total", + "cortex_compactor_blocks_marked_for_no_compaction_total", + } + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 1 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. + cortex_compactor_runs_failed_total 0 + + # TYPE cortex_compactor_block_cleanup_failures_total counter + # HELP cortex_compactor_block_cleanup_failures_total Total number of blocks failed to be deleted. + cortex_compactor_block_cleanup_failures_total 0 + + # HELP cortex_compactor_blocks_cleaned_total Total number of blocks deleted. + # TYPE cortex_compactor_blocks_cleaned_total counter + cortex_compactor_blocks_cleaned_total 1 + + # HELP cortex_compactor_blocks_marked_for_deletion_total Total number of blocks marked for deletion in compactor. + # TYPE cortex_compactor_blocks_marked_for_deletion_total counter + cortex_compactor_blocks_marked_for_deletion_total{reason="compaction",user="user-1"} 0 + cortex_compactor_blocks_marked_for_deletion_total{reason="retention",user="user-1"} 0 + + # TYPE cortex_compactor_block_cleanup_started_total counter + # HELP cortex_compactor_block_cleanup_started_total Total number of blocks cleanup runs started. + cortex_compactor_block_cleanup_started_total{user_status="active"} 1 + cortex_compactor_block_cleanup_started_total{user_status="deleted"} 1 + + # TYPE cortex_compactor_block_cleanup_completed_total counter + # HELP cortex_compactor_block_cleanup_completed_total Total number of blocks cleanup runs successfully completed. + cortex_compactor_block_cleanup_completed_total{user_status="active"} 1 + cortex_compactor_block_cleanup_completed_total{user_status="deleted"} 1 + + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. + # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 0 + `), testedMetrics...)) +} + +func TestPartitionCompactor_ShouldNotCompactBlocksMarkedForSkipCompact(t *testing.T) { + t.Parallel() + + partitionedGroupID1 := getPartitionedGroupID("user-1") + partitionedGroupID2 := getPartitionedGroupID("user-2") + // Mock the bucket to contain two users, each one with one block. 
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json"}, nil) + bucketClient.MockIter("user-2/", []string{"user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", "user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockIter("user-2/markers/", nil, nil) + bucketClient.MockGet("user-2/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", mockNoCompactBlockJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json", mockBlockMetaJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/no-compact-mark.json", mockNoCompactBlockJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", nil) + + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json", mockBlockMetaJSON("01FN3V83ABR9992RF8WRJZ76ZQ"), nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/partition-0-visit-mark.json", nil) + + 
bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil)
+ bucketClient.MockGet("user-2/bucket-index.json.gz", "", nil)
+ bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil)
+ bucketClient.MockGet("user-2/bucket-index-sync-status.json", "", nil)
+ bucketClient.MockIter("user-1/markers/", nil, nil)
+ bucketClient.MockIter("user-2/markers/", nil, nil)
+ bucketClient.MockUpload("user-1/bucket-index.json.gz", nil)
+ bucketClient.MockUpload("user-2/bucket-index.json.gz", nil)
+ bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil)
+ bucketClient.MockUpload("user-2/bucket-index-sync-status.json", nil)
+ bucketClient.MockGet("user-1/partitioned-groups/"+partitionedGroupID1+".json", "", nil)
+ bucketClient.MockUpload("user-1/partitioned-groups/"+partitionedGroupID1+".json", nil)
+ bucketClient.MockGet("user-2/partitioned-groups/"+partitionedGroupID2+".json", "", nil)
+ bucketClient.MockUpload("user-2/partitioned-groups/"+partitionedGroupID2+".json", nil)
+ bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil)
+ bucketClient.MockIter("user-2/"+PartitionedGroupDirectory, nil, nil)
+
+ c, _, tsdbPlanner, _, registry := prepareForPartitioning(t, prepareConfigForPartitioning(), bucketClient, nil, nil)
+
+ tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil)
+
+ require.NoError(t, services.StartAndAwaitRunning(context.Background(), c))
+
+ cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} {
+ return prom_testutil.ToFloat64(c.CompactionRunsCompleted)
+ })
+
+ require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c))
+
+ // Planner is still called for the user whose blocks are all marked for skip compaction.
+ tsdbPlanner.AssertNumberOfCalls(t, "Plan", 2)
+
+ assert.ElementsMatch(t, []string{"01DTVP434PA9VFXSW2JKB3392D", "01FN6CDF3PNEWWRY5MPGJPE3EX"}, tsdbPlanner.getNoCompactBlocks())
+
+ testedMetrics := []string{"cortex_compactor_blocks_marked_for_no_compaction_total"}
+
+ assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(`
+ # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run.
+ # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter
+ cortex_compactor_blocks_marked_for_no_compaction_total 0
+ `), testedMetrics...))
+}
+
+func TestPartitionCompactor_ShouldNotCompactBlocksForUsersMarkedForDeletion(t *testing.T) {
+ t.Parallel()
+
+ cfg := prepareConfigForPartitioning()
+ cfg.DeletionDelay = 10 * time.Minute // Delete block after 10 minutes
+ cfg.TenantCleanupDelay = 10 * time.Minute // To make sure it's not 0.
+
+ partitionedGroupID1 := getPartitionedGroupID("user-1")
+ // Mock the bucket to contain a single user marked for deletion, with one block.
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1"}, nil) + bucketClient.MockIter("__markers__", []string{"__markers__/user-1/"}, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D"}, nil) + bucketClient.MockGet(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), `{"deletion_time": 1}`, nil) + bucketClient.MockUpload(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), nil) + + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + + bucketClient.MockIter("user-1/01DTVP434PA9VFXSW2JKB3392D", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01DTVP434PA9VFXSW2JKB3392D/index"}, nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/index", "some index content", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", nil) + bucketClient.MockExists("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", false, nil) + + bucketClient.MockDelete("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", nil) + bucketClient.MockDelete("user-1/01DTVP434PA9VFXSW2JKB3392D/index", nil) + bucketClient.MockDelete("user-1/bucket-index.json.gz", nil) + bucketClient.MockDelete("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockGet("user-1/partitioned-groups/"+partitionedGroupID1+".json", "", nil) + bucketClient.MockUpload("user-1/partitioned-groups/"+partitionedGroupID1+".json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + + c, _, tsdbPlanner, logs, registry := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + // No user is compacted, single user we have is marked for deletion. + tsdbPlanner.AssertNumberOfCalls(t, "Plan", 0) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=info component=compactor msg="discovered users from bucket" users=1`, + `level=debug component=compactor msg="skipping user because it is marked for deletion" user=user-1`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) + + // Instead of testing for shipper metrics, we only check our metrics here. + // Real shipper metrics are too variable to embed into a test. 
+ testedMetrics := []string{ + "cortex_compactor_runs_started_total", "cortex_compactor_runs_completed_total", "cortex_compactor_runs_failed_total", + "cortex_compactor_blocks_cleaned_total", "cortex_compactor_block_cleanup_failures_total", "cortex_compactor_blocks_marked_for_deletion_total", + "cortex_compactor_block_cleanup_started_total", "cortex_compactor_block_cleanup_completed_total", "cortex_compactor_block_cleanup_failed_total", + "cortex_bucket_blocks_count", "cortex_bucket_blocks_marked_for_deletion_count", "cortex_bucket_index_last_successful_update_timestamp_seconds", + "cortex_compactor_blocks_marked_for_no_compaction_total", + } + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # TYPE cortex_compactor_runs_started_total counter + # HELP cortex_compactor_runs_started_total Total number of compaction runs started. + cortex_compactor_runs_started_total 1 + + # TYPE cortex_compactor_runs_completed_total counter + # HELP cortex_compactor_runs_completed_total Total number of compaction runs successfully completed. + cortex_compactor_runs_completed_total 1 + + # TYPE cortex_compactor_runs_failed_total counter + # HELP cortex_compactor_runs_failed_total Total number of compaction runs failed. + cortex_compactor_runs_failed_total 0 + + # TYPE cortex_compactor_block_cleanup_failures_total counter + # HELP cortex_compactor_block_cleanup_failures_total Total number of blocks failed to be deleted. + cortex_compactor_block_cleanup_failures_total 0 + + # HELP cortex_compactor_blocks_cleaned_total Total number of blocks deleted. + # TYPE cortex_compactor_blocks_cleaned_total counter + cortex_compactor_blocks_cleaned_total 1 + + # TYPE cortex_compactor_block_cleanup_started_total counter + # HELP cortex_compactor_block_cleanup_started_total Total number of blocks cleanup runs started. + cortex_compactor_block_cleanup_started_total{user_status="active"} 1 + cortex_compactor_block_cleanup_started_total{user_status="deleted"} 1 + + # TYPE cortex_compactor_block_cleanup_completed_total counter + # HELP cortex_compactor_block_cleanup_completed_total Total number of blocks cleanup runs successfully completed. + cortex_compactor_block_cleanup_completed_total{user_status="active"} 1 + cortex_compactor_block_cleanup_completed_total{user_status="deleted"} 1 + + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. + # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 0 + `), testedMetrics...)) +} + +func TestPartitionCompactor_ShouldSkipOutOrOrderBlocks(t *testing.T) { + bucketClient, tmpDir := cortex_storage_testutil.PrepareFilesystemBucket(t) + bucketClient = bucketindex.BucketWithGlobalMarkers(bucketClient) + + b1 := createTSDBBlock(t, bucketClient, "user-1", 10, 20, map[string]string{"__name__": "Teste"}) + b2 := createTSDBBlock(t, bucketClient, "user-1", 20, 30, map[string]string{"__name__": "Teste"}) + + // Read bad index file. 
+ indexFile, err := os.Open("testdata/out_of_order_chunks/index") + require.NoError(t, err) + indexFileStat, err := indexFile.Stat() + require.NoError(t, err) + + dir := path.Join(tmpDir, "user-1", b1.String()) + outputFile, err := os.OpenFile(path.Join(dir, "index"), os.O_RDWR|os.O_TRUNC, 0755) + require.NoError(t, err) + + n, err := io.Copy(outputFile, indexFile) + require.NoError(t, err) + require.Equal(t, indexFileStat.Size(), n) + + cfg := prepareConfigForPartitioning() + cfg.SkipBlocksWithOutOfOrderChunksEnabled = true + c, tsdbCompac, tsdbPlanner, _, registry := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + + tsdbCompac.On("CompactWithBlockPopulator", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(b1, nil) + + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ + ULID: b1, + MinTime: 10, + MaxTime: 20, + }, + }, + { + BlockMeta: tsdb.BlockMeta{ + ULID: b2, + MinTime: 20, + MaxTime: 30, + }, + }, + }, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + defer services.StopAndAwaitTerminated(context.Background(), c) //nolint:errcheck + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, true, func() interface{} { + if _, err := os.Stat(path.Join(dir, "no-compact-mark.json")); err == nil { + return true + } + return false + }) + + assert.NoError(t, prom_testutil.GatherAndCompare(registry, strings.NewReader(` + # HELP cortex_compactor_blocks_marked_for_no_compaction_total Total number of blocks marked for no compact during a compaction run. + # TYPE cortex_compactor_blocks_marked_for_no_compaction_total counter + cortex_compactor_blocks_marked_for_no_compaction_total 1 + `), "cortex_compactor_blocks_marked_for_no_compaction_total")) +} + +func TestPartitionCompactor_ShouldCompactAllUsersOnShardingEnabledButOnlyOneInstanceRunning(t *testing.T) { + t.Parallel() + + partitionedGroupID1 := getPartitionedGroupID("user-1") + partitionedGroupID2 := getPartitionedGroupID("user-2") + // Mock the bucket to contain two users, each one with one block. 
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-2"), false, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json"}, nil) + bucketClient.MockIter("user-2/", []string{"user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", "user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockIter("user-2/markers/", nil, nil) + bucketClient.MockGet("user-2/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-2/markers/cleaner-visit-marker.json", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/meta.json", mockBlockMetaJSON("01FN6CDF3PNEWWRY5MPGJPE3EX"), nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-1/01FN6CDF3PNEWWRY5MPGJPE3EX/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-2/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/meta.json", mockBlockMetaJSON("01FN3V83ABR9992RF8WRJZ76ZQ"), nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/partition-0-visit-mark.json", "", nil) + bucketClient.MockUpload("user-2/01FN3V83ABR9992RF8WRJZ76ZQ/partition-0-visit-mark.json", nil) + bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-2/bucket-index.json.gz", "", nil) + 
bucketClient.MockGet("user-1/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet("user-2/bucket-index-sync-status.json", "", nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-2/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockUpload("user-2/bucket-index-sync-status.json", nil) + bucketClient.MockGet("user-1/partitioned-groups/"+partitionedGroupID1+".json", "", nil) + bucketClient.MockUpload("user-1/partitioned-groups/"+partitionedGroupID1+".json", nil) + bucketClient.MockGet("user-2/partitioned-groups/"+partitionedGroupID2+".json", "", nil) + bucketClient.MockUpload("user-2/partitioned-groups/"+partitionedGroupID2+".json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + bucketClient.MockIter("user-2/"+PartitionedGroupDirectory, nil, nil) + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = "compactor-1" + cfg.ShardingRing.InstanceAddr = "1.2.3.4" + cfg.ShardingRing.KVStore.Mock = ringStore + + c, _, tsdbPlanner, logs, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + + // Ensure a plan has been executed for the blocks of each user. 
+ tsdbPlanner.AssertNumberOfCalls(t, "Plan", 2) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="auto joined with new tokens" ring=compactor state=ACTIVE`, + `level=info component=compactor msg="waiting until compactor is ACTIVE in the ring"`, + `level=info component=compactor msg="compactor is ACTIVE in the ring"`, + `level=info component=compactor msg="compactor started"`, + `level=info component=compactor msg="discovering users from bucket"`, + `level=info component=compactor msg="discovered users from bucket" users=2`, + `level=info component=compactor msg="starting compaction of user blocks" user=user-1`, + `level=info component=compactor org_id=user-1 msg="start sync of metas"`, + `level=info component=compactor org_id=user-1 msg="start of GC"`, + `level=info component=compactor org_id=user-1 msg="start of compactions"`, + `level=info component=compactor org_id=user-1 msg="compaction iterations done"`, + `level=info component=compactor msg="successfully compacted user blocks" user=user-1`, + `level=info component=compactor msg="starting compaction of user blocks" user=user-2`, + `level=info component=compactor org_id=user-2 msg="start sync of metas"`, + `level=info component=compactor org_id=user-2 msg="start of GC"`, + `level=info component=compactor org_id=user-2 msg="start of compactions"`, + `level=info component=compactor org_id=user-2 msg="compaction iterations done"`, + `level=info component=compactor msg="successfully compacted user blocks" user=user-2`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) +} + +func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning(t *testing.T) { + + numUsers := 100 + + // Setup user IDs + userIDs := make([]string, 0, numUsers) + for i := 1; i <= numUsers; i++ { + userIDs = append(userIDs, fmt.Sprintf("user-%d", i)) + } + + // Mock the bucket to contain all users, each one with one block. 
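+	// Descriptive note (added): every tenant gets an identical single-block layout; ownership is
+	// expected to be decided purely by the ring, and findCompactorByUserID is used further down to
+	// check that exactly one compactor handled each tenant.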
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", userIDs, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + for _, userID := range userIDs { + partitionedGroupID := getPartitionedGroupID(userID) + bucketClient.MockIter(userID+"/", []string{userID + "/01DTVP434PA9VFXSW2JKB3392D"}, nil) + bucketClient.MockIter(userID+"/markers/", nil, nil) + bucketClient.MockGet(userID+"/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath(userID), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath(userID), false, nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet(userID+"/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet(userID+"/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet(userID+"/bucket-index.json.gz", "", nil) + bucketClient.MockUpload(userID+"/bucket-index.json.gz", nil) + bucketClient.MockUpload(userID+"/bucket-index-sync-status.json", nil) + bucketClient.MockGet(userID+"/partitioned-groups/"+partitionedGroupID+".json", "", nil) + bucketClient.MockUpload(userID+"/partitioned-groups/"+partitionedGroupID+".json", nil) + bucketClient.MockIter(userID+"/"+PartitionedGroupDirectory, nil, nil) + } + + // Create a shared KV Store + kvstore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Create two compactors + var compactors []*Compactor + var logs []*concurrency.SyncBuffer + + for i := 1; i <= 2; i++ { + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) + cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.KVStore.Mock = kvstore + + c, _, tsdbPlanner, l, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + defer services.StopAndAwaitTerminated(context.Background(), c) //nolint:errcheck + + compactors = append(compactors, c) + logs = append(logs, l) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). 
+ tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + } + + // Start all compactors + for _, c := range compactors { + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + } + + // Wait until a run has been completed on each compactor + for _, c := range compactors { + cortex_testutil.Poll(t, 60*time.Second, true, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) >= 1 + }) + } + + // Ensure that each user has been compacted by the correct instance + for _, userID := range userIDs { + _, l, err := findCompactorByUserID(compactors, logs, userID) + require.NoError(t, err) + assert.Contains(t, l.String(), fmt.Sprintf(`level=info component=compactor msg="successfully compacted user blocks" user=%s`, userID)) + } +} + +func TestPartitionCompactor_ShouldCompactOnlyShardsOwnedByTheInstanceOnShardingEnabledWithShuffleShardingAndMultipleInstancesRunning(t *testing.T) { + t.Parallel() + + numUsers := 3 + + // Setup user IDs + userIDs := make([]string, 0, numUsers) + for i := 1; i <= numUsers; i++ { + userIDs = append(userIDs, fmt.Sprintf("user-%d", i)) + } + + startTime := int64(1574776800000) + // Define blocks mapping block IDs to start and end times + blocks := map[string]map[string]int64{ + "01DTVP434PA9VFXSW2JKB3392D": { + "startTime": startTime, + "endTime": startTime + time.Hour.Milliseconds()*2, + }, + "01DTVP434PA9VFXSW2JKB3392E": { + "startTime": startTime, + "endTime": startTime + time.Hour.Milliseconds()*2, + }, + "01DTVP434PA9VFXSW2JKB3392F": { + "startTime": startTime + time.Hour.Milliseconds()*2, + "endTime": startTime + time.Hour.Milliseconds()*4, + }, + "01DTVP434PA9VFXSW2JKB3392G": { + "startTime": startTime + time.Hour.Milliseconds()*2, + "endTime": startTime + time.Hour.Milliseconds()*4, + }, + // Add another new block as the final block so that the previous groups will be planned for compaction + "01DTVP434PA9VFXSW2JKB3392H": { + "startTime": startTime + time.Hour.Milliseconds()*4, + "endTime": startTime + time.Hour.Milliseconds()*6, + }, + } + + // Mock the bucket to contain all users, each one with five blocks, 2 sets of overlapping blocks and 1 separate block. 
+ bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", userIDs, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + + // Keys with a value greater than 1 will be groups that should be compacted + groupHashes := make(map[uint32]int) + for _, userID := range userIDs { + blockFiles := []string{} + + for blockID, blockTimes := range blocks { + groupHash := hashGroup(userID, blockTimes["startTime"], blockTimes["endTime"]) + visitMarker := partitionVisitMarker{ + CompactorID: "test-compactor", + VisitTime: time.Now().Unix(), + PartitionedGroupID: groupHash, + PartitionID: 0, + Status: Pending, + Version: PartitionVisitMarkerVersion1, + } + visitMarkerFileContent, _ := json.Marshal(visitMarker) + bucketClient.MockGet(userID+"/bucket-index-sync-status.json", "", nil) + bucketClient.MockGet(userID+"/"+blockID+"/meta.json", mockBlockMetaJSONWithTime(blockID, userID, blockTimes["startTime"], blockTimes["endTime"]), nil) + bucketClient.MockGet(userID+"/"+blockID+"/deletion-mark.json", "", nil) + bucketClient.MockGet(userID+"/"+blockID+"/no-compact-mark.json", "", nil) + bucketClient.MockGet(userID+"/"+blockID+"/partition-0-visit-mark.json", "", nil) + bucketClient.MockGet(userID+"/partitioned-groups/visit-marks/"+fmt.Sprint(groupHash)+"/partition-0-visit-mark.json", string(visitMarkerFileContent), nil) + bucketClient.MockGetRequireUpload(userID+"/partitioned-groups/visit-marks/"+fmt.Sprint(groupHash)+"/partition-0-visit-mark.json", string(visitMarkerFileContent), nil) + bucketClient.MockUpload(userID+"/partitioned-groups/visit-marks/"+fmt.Sprint(groupHash)+"/partition-0-visit-mark.json", nil) + // Iter with recursive so expected to get objects rather than directories. + blockFiles = append(blockFiles, path.Join(userID, blockID, block.MetaFilename)) + + // Get all of the unique group hashes so that they can be used to ensure all groups were compacted + groupHashes[groupHash]++ + bucketClient.MockGet(userID+"/partitioned-groups/"+fmt.Sprint(groupHash)+".json", "", nil) + bucketClient.MockUpload(userID+"/partitioned-groups/"+fmt.Sprint(groupHash)+".json", nil) + } + + bucketClient.MockIter(userID+"/", blockFiles, nil) + bucketClient.MockIter(userID+"/markers/", nil, nil) + bucketClient.MockGet(userID+"/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete(userID+"/markers/cleaner-visit-marker.json", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath(userID), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath(userID), false, nil) + bucketClient.MockGet(userID+"/bucket-index.json.gz", "", nil) + bucketClient.MockUpload(userID+"/bucket-index.json.gz", nil) + bucketClient.MockUpload(userID+"/bucket-index-sync-status.json", nil) + bucketClient.MockIter(userID+"/"+PartitionedGroupDirectory, nil, nil) + } + + // Create a shared KV Store + kvstore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Create four compactors + var compactors []*Compactor + var logs []*concurrency.SyncBuffer + + for i := 1; i <= 4; i++ { + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.CompactionInterval = 15 * time.Second + cfg.ShardingStrategy = util.ShardingStrategyShuffle + cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) + cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + 
cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.KVStore.Mock = kvstore + + limits := &validation.Limits{} + flagext.DefaultValues(limits) + limits.CompactorTenantShardSize = 3 + + c, _, tsdbPlanner, l, _ := prepareForPartitioning(t, cfg, bucketClient, limits, nil) + defer services.StopAndAwaitTerminated(context.Background(), c) //nolint:errcheck + + compactors = append(compactors, c) + logs = append(logs, l) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + } + + // Start all compactors + for _, c := range compactors { + require.NoError(t, c.StartAsync(context.Background())) + } + // Wait for all the compactors to get into the Running state without errors. + // Cannot use StartAndAwaitRunning as this would cause the compactions to start before + // all the compactors are initialized + for _, c := range compactors { + require.NoError(t, c.AwaitRunning(context.Background())) + } + + // Wait until a run has been completed on each compactor + for _, c := range compactors { + cortex_testutil.Poll(t, 60*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + } + + // Ensure that each group was only compacted by exactly one compactor + for groupHash, blockCount := range groupHashes { + + l, found, err := checkLogsForPartitionCompaction(compactors, logs, groupHash) + require.NoError(t, err) + + // If the blockCount < 2 then the group shouldn't have been compacted, therefore not found in the logs + if blockCount < 2 { + assert.False(t, found) + } else { + assert.True(t, found) + assert.Contains(t, l.String(), fmt.Sprintf(`group_hash=%d msg="found compactable group for user"`, groupHash)) + } + } +} + +// checkLogsForPartitionCompaction checks the logs to see if a compaction has happened on the groupHash, +// if there has been a compaction it will return the logs of the compactor that handled the group +// and will return true. Otherwise this function will return a nil value for the logs and false +// as the group was not compacted +func checkLogsForPartitionCompaction(compactors []*Compactor, logs []*concurrency.SyncBuffer, groupHash uint32) (*concurrency.SyncBuffer, bool, error) { + var log *concurrency.SyncBuffer + + for _, l := range logs { + owned := strings.Contains(l.String(), fmt.Sprintf(`group_hash=%d msg="found compactable group for user"`, groupHash)) + if owned { + log = l + } + } + + // Return false if we've not been able to find it + if log == nil { + return nil, false, nil + } + + return log, true, nil +} + +func prepareConfigForPartitioning() Config { + compactorCfg := prepareConfig() + + compactorCfg.CompactionStrategy = util.CompactionStrategyPartitioning + + return compactorCfg +} + +func prepareForPartitioning(t *testing.T, compactorCfg Config, bucketClient objstore.InstrumentedBucket, limits *validation.Limits, tsdbGrouper *tsdbGrouperMock) (*Compactor, *tsdbCompactorMock, *tsdbPlannerMock, *concurrency.SyncBuffer, prometheus.Gatherer) { + storageCfg := cortex_tsdb.BlocksStorageConfig{} + flagext.DefaultValues(&storageCfg) + storageCfg.BucketStore.BlockDiscoveryStrategy = string(cortex_tsdb.RecursiveDiscovery) + + // Create a temporary directory for compactor data. 
+ compactorCfg.DataDir = t.TempDir() + + tsdbCompactor := &tsdbCompactorMock{} + tsdbPlanner := &tsdbPlannerMock{ + noCompactMarkFilters: []*compact.GatherNoCompactionMarkFilter{}, + } + logs := &concurrency.SyncBuffer{} + logger := log.NewLogfmtLogger(logs) + registry := prometheus.NewRegistry() + + if limits == nil { + limits = &validation.Limits{} + flagext.DefaultValues(limits) + } + + overrides, err := validation.NewOverrides(*limits, nil) + require.NoError(t, err) + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + blocksCompactorFactory := func(ctx context.Context, cfg Config, logger log.Logger, reg prometheus.Registerer) (compact.Compactor, PlannerFactory, error) { + return tsdbCompactor, + func(ctx context.Context, bkt objstore.InstrumentedBucket, _ log.Logger, _ Config, noCompactMarkFilter *compact.GatherNoCompactionMarkFilter, ringLifecycle *ring.Lifecycler, _ string, _ prometheus.Counter, _ prometheus.Counter, _ *compactorMetrics) compact.Planner { + tsdbPlanner.noCompactMarkFilters = append(tsdbPlanner.noCompactMarkFilters, noCompactMarkFilter) + return tsdbPlanner + }, + nil + } + + var blocksGrouperFactory BlocksGrouperFactory + if tsdbGrouper != nil { + blocksGrouperFactory = func(_ context.Context, _ Config, _ objstore.InstrumentedBucket, _ log.Logger, _ prometheus.Counter, _ prometheus.Counter, _ prometheus.Counter, _ *compact.SyncerMetrics, _ *compactorMetrics, _ *ring.Ring, _ *ring.Lifecycler, _ Limits, _ string, _ *compact.GatherNoCompactionMarkFilter, _ int) compact.Grouper { + return tsdbGrouper + } + } else { + if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle { + blocksGrouperFactory = ShuffleShardingGrouperFactory + } else { + blocksGrouperFactory = DefaultBlocksGrouperFactory + } + } + + var blockDeletableCheckerFactory BlockDeletableCheckerFactory + if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle { + blockDeletableCheckerFactory = PartitionCompactionBlockDeletableCheckerFactory + } else { + blockDeletableCheckerFactory = DefaultBlockDeletableCheckerFactory + } + + var compactionLifecycleCallbackFactory CompactionLifecycleCallbackFactory + if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle { + compactionLifecycleCallbackFactory = ShardedCompactionLifecycleCallbackFactory + } else { + compactionLifecycleCallbackFactory = DefaultCompactionLifecycleCallbackFactory + } + + c, err := newCompactor(compactorCfg, storageCfg, logger, registry, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory, blockDeletableCheckerFactory, compactionLifecycleCallbackFactory, overrides, 1) + require.NoError(t, err) + + return c, tsdbCompactor, tsdbPlanner, logs, registry +} + +type tsdbGrouperMock struct { + mock.Mock +} + +func (m *tsdbGrouperMock) Groups(blocks map[ulid.ULID]*metadata.Meta) (res []*compact.Group, err error) { + args := m.Called(blocks) + return args.Get(0).([]*compact.Group), args.Error(1) +} + +var ( + BlockMinTime = int64(1574776800000) + BlockMaxTime = int64(1574784000000) +) + +func getPartitionedGroupID(userID string) string { + return fmt.Sprint(hashGroup(userID, BlockMinTime, BlockMaxTime)) +} + +func mockBlockGroup(userID string, ids []string, bkt *bucket.ClientMock) *compact.Group { + dummyCounter := prometheus.NewCounter(prometheus.CounterOpts{}) + group, _ := compact.NewGroup( + log.NewNopLogger(), + bkt, + getPartitionedGroupID(userID), + nil, + 0, + true, + true, + dummyCounter, + dummyCounter, + dummyCounter, + 
dummyCounter, + dummyCounter, + dummyCounter, + dummyCounter, + dummyCounter, + metadata.NoneFunc, + 1, + 1, + ) + for _, id := range ids { + meta := mockBlockMeta(id) + err := group.AppendMeta(&metadata.Meta{ + BlockMeta: meta, + }) + if err != nil { + continue + } + } + return group +} + +func TestPartitionCompactor_DeleteLocalSyncFiles(t *testing.T) { + numUsers := 10 + + // Setup user IDs + userIDs := make([]string, 0, numUsers) + for i := 1; i <= numUsers; i++ { + userIDs = append(userIDs, fmt.Sprintf("user-%d", i)) + } + + inmem := objstore.WithNoopInstr(objstore.NewInMemBucket()) + for _, userID := range userIDs { + id, err := ulid.New(ulid.Now(), rand.Reader) + require.NoError(t, err) + require.NoError(t, inmem.Upload(context.Background(), userID+"/"+id.String()+"/meta.json", strings.NewReader(mockBlockMetaJSON(id.String())))) + } + + // Create a shared KV Store + kvstore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Create two compactors + var compactors []*Compactor + + for i := 1; i <= 2; i++ { + cfg := prepareConfigForPartitioning() + + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) + cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.KVStore.Mock = kvstore + + // Each compactor will get its own temp dir for storing local files. + c, _, tsdbPlanner, _, _ := prepareForPartitioning(t, cfg, inmem, nil, nil) + t.Cleanup(func() { + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) + }) + + compactors = append(compactors, c) + + // Mock the planner as if there's no compaction to do, + // in order to simplify tests (all in all, we just want to + // test our logic and not TSDB compactor which we expect to + // be already tested). + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + } + + require.Equal(t, 2, len(compactors)) + c1 := compactors[0] + c2 := compactors[1] + + // Start first compactor + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c1)) + + // Wait until a run has been completed on first compactor. This happens as soon as compactor starts. + cortex_testutil.Poll(t, 20*time.Second, true, func() interface{} { + return prom_testutil.ToFloat64(c1.CompactionRunsCompleted) >= 1 + }) + + require.NoError(t, os.Mkdir(c1.metaSyncDirForUser("new-user"), 0600)) + + // Verify that first compactor has synced all the users, plus there is one extra we have just created. + require.Equal(t, numUsers+1, len(c1.listTenantsWithMetaSyncDirectories())) + + // Now start second compactor, and wait until it runs compaction. + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c2)) + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c2.CompactionRunsCompleted) + }) + + // Let's check how many users second compactor has. + c2Users := len(c2.listTenantsWithMetaSyncDirectories()) + require.NotZero(t, c2Users) + + // Force new compaction cycle on first compactor. It will run the cleanup of un-owned users at the end of compaction cycle. + c1.compactUsers(context.Background()) + c1Users := len(c1.listTenantsWithMetaSyncDirectories()) + + // Now compactor 1 should have cleaned old sync files. 
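+	// Descriptive note (added): after cleanup, each tenant's meta-sync directory should live on
+	// exactly one compactor, so the two per-instance counts are expected to partition numUsers
+	// between them.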
+ require.NotEqual(t, numUsers, c1Users)
+ require.Equal(t, numUsers, c1Users+c2Users)
+}
+
+func TestPartitionCompactor_ShouldFailCompactionOnTimeout(t *testing.T) {
+ t.Parallel()
+
+ // Mock the bucket
+ bucketClient := &bucket.ClientMock{}
+ bucketClient.MockIter("", []string{}, nil)
+ bucketClient.MockIter("__markers__", []string{}, nil)
+
+ ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil)
+ t.Cleanup(func() { assert.NoError(t, closer.Close()) })
+
+ cfg := prepareConfigForPartitioning()
+ cfg.ShardingEnabled = true
+ cfg.ShardingRing.InstanceID = "compactor-1"
+ cfg.ShardingRing.InstanceAddr = "1.2.3.4"
+ cfg.ShardingRing.KVStore.Mock = ringStore
+
+ // Set ObservePeriod to longer than the timeout period to mock a timeout while waiting on ring to become ACTIVE
+ cfg.ShardingRing.ObservePeriod = time.Second * 10
+
+ c, _, _, logs, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil)
+
+ // Try to start the compactor. Since the observe period outlasts the start timeout, the compactor
+ // never becomes ACTIVE in the ring and startup should fail with a deadline-exceeded error.
+ err := services.StartAndAwaitRunning(context.Background(), c)
+
+ // Assert that the compactor timed out
+ assert.Equal(t, context.DeadlineExceeded, err)
+
+ assert.ElementsMatch(t, []string{
+ `level=info component=compactor msg="compactor started"`,
+ `level=info component=compactor msg="waiting until compactor is ACTIVE in the ring"`,
+ `level=info component=compactor msg="auto joined with new tokens" ring=compactor state=JOINING`,
+ `level=error component=compactor msg="compactor failed to become ACTIVE in the ring" err="context deadline exceeded"`,
+ }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n")))
+}
+
+func TestPartitionCompactor_ShouldNotHangIfPlannerReturnNothing(t *testing.T) {
+ t.Parallel()
+
+ ss := bucketindex.Status{Status: bucketindex.CustomerManagedKeyError, Version: bucketindex.SyncStatusFileVersion}
+ content, err := json.Marshal(ss)
+ require.NoError(t, err)
+
+ partitionedGroupID := getPartitionedGroupID("user-1")
+ bucketClient := &bucket.ClientMock{}
+ bucketClient.MockIter("__markers__", []string{}, nil)
+ bucketClient.MockIter("", []string{"user-1"}, nil)
+ bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ"}, nil)
+ bucketClient.MockIter("user-1/markers/", nil, nil)
+ bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil)
+ bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil)
+ bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil)
+ bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil)
+ bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil)
+ bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil)
+ bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil)
+ bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil)
+ bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/partition-0-visit-mark.json", "", nil)
+ bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil)
+ bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", nil)
+ bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", nil)
+ bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/partition-0-visit-mark.json", "", nil)
+
bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", string(content), nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + bucketClient.MockGet("user-1/partitioned-groups/visit-marks/"+string(partitionedGroupID)+"/partition-0-visit-mark.json", "", nil) + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = "compactor-1" + cfg.ShardingRing.InstanceAddr = "1.2.3.4" + cfg.ShardingRing.KVStore.Mock = ringStore + + tsdbGrouper := tsdbGrouperMock{} + mockGroups := []*compact.Group{mockBlockGroup("user-1", []string{"01DTVP434PA9VFXSW2JKB3392D", "01DTW0ZCPDDNV4BV83Q2SV4QAZ"}, bucketClient)} + tsdbGrouper.On("Groups", mock.Anything).Return(mockGroups, nil) + + c, _, tsdbPlanner, _, _ := prepareForPartitioning(t, cfg, bucketClient, nil, &tsdbGrouper) + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) +} + +func TestPartitionCompactor_ShouldNotFailCompactionIfAccessDeniedErrDuringMetaSync(t *testing.T) { + t.Parallel() + + ss := bucketindex.Status{Status: bucketindex.Ok, Version: bucketindex.SyncStatusFileVersion} + content, err := json.Marshal(ss) + require.NoError(t, err) + + partitionedGroupID := getPartitionedGroupID("user-1") + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockIter("", []string{"user-1"}, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ", "user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), bucket.ErrKeyPermissionDenied) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", bucket.ErrKeyPermissionDenied) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", bucket.ErrKeyPermissionDenied) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), bucket.ErrKeyPermissionDenied) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", bucket.ErrKeyPermissionDenied) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", bucket.ErrKeyPermissionDenied) + 
bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", string(content), nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + bucketClient.MockGet("user-1/partitioned-groups/visit-marks/"+string(partitionedGroupID)+"/partition-0-visit-mark.json", "", nil) + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = "compactor-1" + cfg.ShardingRing.InstanceAddr = "1.2.3.4" + cfg.ShardingRing.KVStore.Mock = ringStore + + c, _, tsdbPlanner, _, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) +} + +func TestPartitionCompactor_ShouldNotFailCompactionIfAccessDeniedErrReturnedFromBucket(t *testing.T) { + t.Parallel() + + ss := bucketindex.Status{Status: bucketindex.Ok, Version: bucketindex.SyncStatusFileVersion} + content, err := json.Marshal(ss) + require.NoError(t, err) + + partitionedGroupID := getPartitionedGroupID("user-1") + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("__markers__", []string{}, nil) + bucketClient.MockIter("", []string{"user-1"}, nil) + bucketClient.MockIter("user-1/", []string{"user-1/01DTVP434PA9VFXSW2JKB3392D", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ", "user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", "user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json"}, nil) + bucketClient.MockIter("user-1/markers/", nil, nil) + bucketClient.MockGet("user-1/markers/cleaner-visit-marker.json", "", nil) + bucketClient.MockUpload("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockDelete("user-1/markers/cleaner-visit-marker.json", nil) + bucketClient.MockExists(cortex_tsdb.GetGlobalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockExists(cortex_tsdb.GetLocalDeletionMarkPath("user-1"), false, nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/meta.json", mockBlockMetaJSON("01DTVP434PA9VFXSW2JKB3392D"), nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTVP434PA9VFXSW2JKB3392D/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/meta.json", mockBlockMetaJSON("01DTW0ZCPDDNV4BV83Q2SV4QAZ"), nil) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/deletion-mark.json", "", nil) + bucketClient.MockGet("user-1/01DTW0ZCPDDNV4BV83Q2SV4QAZ/no-compact-mark.json", "", nil) + bucketClient.MockGet("user-1/bucket-index.json.gz", "", nil) + bucketClient.MockGet("user-1/bucket-index-sync-status.json", string(content), nil) + bucketClient.MockUpload("user-1/bucket-index.json.gz", nil) + bucketClient.MockUpload("user-1/bucket-index-sync-status.json", nil) + bucketClient.MockIter("user-1/"+PartitionedGroupDirectory, nil, nil) + 
bucketClient.MockGet("user-1/partitioned-groups/visit-marks/"+string(partitionedGroupID)+"/partition-0-visit-mark.json", "", nil) + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + cfg := prepareConfigForPartitioning() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = "compactor-1" + cfg.ShardingRing.InstanceAddr = "1.2.3.4" + cfg.ShardingRing.KVStore.Mock = ringStore + + c, _, tsdbPlanner, _, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) + tsdbPlanner.On("Plan", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]*metadata.Meta{}, bucket.ErrKeyPermissionDenied) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), c)) + + // Wait until a run has completed. + cortex_testutil.Poll(t, 20*time.Second, 1.0, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + }) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), c)) +} diff --git a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index fd67e6b6505..94c3ae46537 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -1111,8 +1111,8 @@ func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndM cfg.ShardingEnabled = true cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) - cfg.ShardingRing.WaitStabilityMinDuration = 3 * time.Second - cfg.ShardingRing.WaitStabilityMaxDuration = 10 * time.Second + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second cfg.ShardingRing.KVStore.Mock = kvstore c, _, tsdbPlanner, l, _ := prepare(t, cfg, bucketClient, nil) @@ -1135,8 +1135,8 @@ func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndM // Wait until a run has been completed on each compactor for _, c := range compactors { - cortex_testutil.Poll(t, 10*time.Second, 1.0, func() interface{} { - return prom_testutil.ToFloat64(c.CompactionRunsCompleted) + cortex_testutil.Poll(t, 120*time.Second, true, func() interface{} { + return prom_testutil.ToFloat64(c.CompactionRunsCompleted) >= 1 }) } @@ -1244,8 +1244,8 @@ func TestCompactor_ShouldCompactOnlyShardsOwnedByTheInstanceOnShardingEnabledWit cfg.ShardingStrategy = util.ShardingStrategyShuffle cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) - cfg.ShardingRing.WaitStabilityMinDuration = 3 * time.Second - cfg.ShardingRing.WaitStabilityMaxDuration = 10 * time.Second + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second cfg.ShardingRing.KVStore.Mock = kvstore limits := &validation.Limits{} @@ -1521,6 +1521,9 @@ func prepareConfig() Config { compactorCfg.retryMinBackoff = 0 compactorCfg.retryMaxBackoff = 0 + //Avoid jitter in startup + compactorCfg.CompactionInterval = 5 * time.Second + // The migration is tested in a dedicated test. 
compactorCfg.BlockDeletionMarksMigrationEnabled = false @@ -1578,7 +1581,7 @@ func prepare(t *testing.T, compactorCfg Config, bucketClient objstore.Instrument blocksGrouperFactory = DefaultBlocksGrouperFactory } - c, err := newCompactor(compactorCfg, storageCfg, logger, registry, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory, overrides, 1) + c, err := newCompactor(compactorCfg, storageCfg, logger, registry, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory, DefaultBlockDeletableCheckerFactory, DefaultCompactionLifecycleCallbackFactory, overrides, 1) require.NoError(t, err) return c, tsdbCompactor, tsdbPlanner, logs, registry @@ -1748,13 +1751,12 @@ func TestCompactor_DeleteLocalSyncFiles(t *testing.T) { for i := 1; i <= 2; i++ { cfg := prepareConfig() - cfg.CompactionInterval = 10 * time.Minute // We will only call compaction manually. cfg.ShardingEnabled = true cfg.ShardingRing.InstanceID = fmt.Sprintf("compactor-%d", i) cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) - cfg.ShardingRing.WaitStabilityMinDuration = 3 * time.Second - cfg.ShardingRing.WaitStabilityMaxDuration = 10 * time.Second + cfg.ShardingRing.WaitStabilityMinDuration = time.Second + cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second cfg.ShardingRing.KVStore.Mock = kvstore // Each compactor will get its own temp dir for storing local files. diff --git a/pkg/compactor/partition_compaction_complete_checker.go b/pkg/compactor/partition_compaction_complete_checker.go new file mode 100644 index 00000000000..3bb6b37b055 --- /dev/null +++ b/pkg/compactor/partition_compaction_complete_checker.go @@ -0,0 +1,16 @@ +package compactor + +import ( + "github.com/oklog/ulid" + "github.com/thanos-io/thanos/pkg/compact" +) + +type PartitionCompactionBlockDeletableChecker struct{} + +func NewPartitionCompactionBlockDeletableChecker() *PartitionCompactionBlockDeletableChecker { + return &PartitionCompactionBlockDeletableChecker{} +} + +func (p *PartitionCompactionBlockDeletableChecker) CanDelete(_ *compact.Group, _ ulid.ULID) bool { + return false +} diff --git a/pkg/compactor/sharded_block_populator.go b/pkg/compactor/sharded_block_populator.go new file mode 100644 index 00000000000..49a5c838b5f --- /dev/null +++ b/pkg/compactor/sharded_block_populator.go @@ -0,0 +1,208 @@ +package compactor + +import ( + "context" + "io" + "maps" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/pkg/errors" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/tsdb/chunks" + tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" +) + +type ShardedBlockPopulator struct { + partitionCount int + partitionID int + logger log.Logger +} + +// PopulateBlock fills the index and chunk writers with new data gathered as the union +// of the provided blocks. It returns meta information for the new block. +// It expects sorted blocks input by mint. 
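+// Added clarification: unlike the default populator, it only writes the subset of series whose
+// sharded posting falls into this populator's partition (partitionID out of partitionCount).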
+// The main logic is copied from tsdb.DefaultPopulateBlockFunc +func (c ShardedBlockPopulator) PopulateBlock(ctx context.Context, metrics *tsdb.CompactorMetrics, _ log.Logger, chunkPool chunkenc.Pool, mergeFunc storage.VerticalChunkSeriesMergeFunc, blocks []tsdb.BlockReader, meta *tsdb.BlockMeta, indexw tsdb.IndexWriter, chunkw tsdb.ChunkWriter, postingsFunc tsdb.IndexReaderPostingsFunc) (err error) { + if len(blocks) == 0 { + return errors.New("cannot populate block from no readers") + } + + var ( + sets []storage.ChunkSeriesSet + setsMtx sync.Mutex + symbols map[string]struct{} + closers []io.Closer + ) + symbols = make(map[string]struct{}) + defer func() { + errs := tsdb_errors.NewMulti(err) + if cerr := tsdb_errors.CloseAll(closers); cerr != nil { + errs.Add(errors.Wrap(cerr, "close")) + } + err = errs.Err() + metrics.PopulatingBlocks.Set(0) + }() + metrics.PopulatingBlocks.Set(1) + + globalMaxt := blocks[0].Meta().MaxTime + g, gCtx := errgroup.WithContext(ctx) + g.SetLimit(8) + for _, b := range blocks { + select { + case <-gCtx.Done(): + return gCtx.Err() + default: + } + + if b.Meta().MaxTime > globalMaxt { + globalMaxt = b.Meta().MaxTime + } + + indexr, err := b.Index() + if err != nil { + return errors.Wrapf(err, "open index reader for block %+v", b.Meta()) + } + closers = append(closers, indexr) + + chunkr, err := b.Chunks() + if err != nil { + return errors.Wrapf(err, "open chunk reader for block %+v", b.Meta()) + } + closers = append(closers, chunkr) + + tombsr, err := b.Tombstones() + if err != nil { + return errors.Wrapf(err, "open tombstone reader for block %+v", b.Meta()) + } + closers = append(closers, tombsr) + + all := postingsFunc(gCtx, indexr) + g.Go(func() error { + shardStart := time.Now() + shardedPosting, syms, err := NewShardedPosting(gCtx, all, uint64(c.partitionCount), uint64(c.partitionID), indexr.Series) + if err != nil { + return err + } + level.Debug(c.logger).Log("msg", "finished sharding", "duration", time.Since(shardStart)) + // Blocks meta is half open: [min, max), so subtract 1 to ensure we don't hold samples with exact meta.MaxTime timestamp. + setsMtx.Lock() + sets = append(sets, tsdb.NewBlockChunkSeriesSet(meta.ULID, indexr, chunkr, tombsr, shardedPosting, meta.MinTime, meta.MaxTime-1, false)) + maps.Copy(symbols, syms) + setsMtx.Unlock() + return nil + }) + } + if err := g.Wait(); err != nil { + return err + } + + symbolsList := make([]string, len(symbols)) + symbolIdx := 0 + for symbol := range symbols { + symbolsList[symbolIdx] = symbol + symbolIdx++ + } + slices.Sort(symbolsList) + for _, symbol := range symbolsList { + if err := indexw.AddSymbol(symbol); err != nil { + return errors.Wrap(err, "add symbol") + } + } + + var ( + ref = storage.SeriesRef(0) + ch = make(chan func() error, 1000) + ) + + set := sets[0] + if len(sets) > 1 { + iCtx, cancel := context.WithCancel(ctx) + // Merge series using specified chunk series merger. + // The default one is the compacting series merger. + set = NewBackgroundChunkSeriesSet(iCtx, storage.NewMergeChunkSeriesSet(sets, mergeFunc)) + defer cancel() + } + + go func() { + // Iterate over all sorted chunk series. 
+ for set.Next() { + select { + case <-ctx.Done(): + ch <- func() error { return ctx.Err() } + default: + } + s := set.At() + curChksIter := s.Iterator(nil) + + var chks []chunks.Meta + var wg sync.WaitGroup + r := ref + wg.Add(1) + go func() { + for curChksIter.Next() { + // We are not iterating in streaming way over chunk as + // it's more efficient to do bulk write for index and + // chunk file purposes. + chks = append(chks, curChksIter.At()) + } + wg.Done() + }() + + ch <- func() error { + wg.Wait() + if curChksIter.Err() != nil { + return errors.Wrap(curChksIter.Err(), "chunk iter") + } + + // Skip the series with all deleted chunks. + if len(chks) == 0 { + return nil + } + + if err := chunkw.WriteChunks(chks...); err != nil { + return errors.Wrap(err, "write chunks") + } + if err := indexw.AddSeries(r, s.Labels(), chks...); err != nil { + return errors.Wrap(err, "add series") + } + + meta.Stats.NumChunks += uint64(len(chks)) + meta.Stats.NumSeries++ + for _, chk := range chks { + meta.Stats.NumSamples += uint64(chk.Chunk.NumSamples()) + } + + for _, chk := range chks { + if err := chunkPool.Put(chk.Chunk); err != nil { + return errors.Wrap(err, "put chunk") + } + } + + return nil + } + + ref++ + } + close(ch) + }() + + for callback := range ch { + err := callback() + if err != nil { + return err + } + } + + if set.Err() != nil { + return errors.Wrap(set.Err(), "iterate compaction set") + } + + return nil +} diff --git a/pkg/compactor/sharded_compaction_lifecycle_callback.go b/pkg/compactor/sharded_compaction_lifecycle_callback.go new file mode 100644 index 00000000000..ab6fc93845c --- /dev/null +++ b/pkg/compactor/sharded_compaction_lifecycle_callback.go @@ -0,0 +1,108 @@ +package compactor + +import ( + "context" + "path/filepath" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/oklog/ulid" + "github.com/prometheus/prometheus/tsdb" + "github.com/thanos-io/objstore" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/compact" + "github.com/thanos-io/thanos/pkg/runutil" + + cortextsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" +) + +type ShardedCompactionLifecycleCallback struct { + ctx context.Context + userBucket objstore.InstrumentedBucket + logger log.Logger + metaSyncConcurrency int + compactDir string + userID string + compactorMetrics *compactorMetrics + + startTime time.Time +} + +func NewShardedCompactionLifecycleCallback( + ctx context.Context, + userBucket objstore.InstrumentedBucket, + logger log.Logger, + metaSyncConcurrency int, + compactDir string, + userID string, + compactorMetrics *compactorMetrics, +) *ShardedCompactionLifecycleCallback { + return &ShardedCompactionLifecycleCallback{ + ctx: ctx, + userBucket: userBucket, + logger: logger, + metaSyncConcurrency: metaSyncConcurrency, + compactDir: compactDir, + userID: userID, + compactorMetrics: compactorMetrics, + } +} + +func (c *ShardedCompactionLifecycleCallback) PreCompactionCallback(_ context.Context, logger log.Logger, g *compact.Group, meta []*metadata.Meta) error { + c.startTime = time.Now() + + metaExt, err := cortextsdb.ConvertToCortexMetaExtensions(g.Extensions()) + if err != nil { + level.Warn(logger).Log("msg", "unable to get cortex meta extensions", "err", err) + } else if metaExt != nil { + c.compactorMetrics.compactionPlanned.WithLabelValues(c.userID, metaExt.TimeRangeStr()).Inc() + } + + // Delete local files other than current group + var ignoreDirs []string + for _, m := range 
meta { + ignoreDirs = append(ignoreDirs, filepath.Join(g.Key(), m.ULID.String())) + } + if err := runutil.DeleteAll(c.compactDir, ignoreDirs...); err != nil { + level.Warn(logger).Log("msg", "failed deleting non-current compaction group files, disk space usage might have leaked.", "err", err, "dir", c.compactDir) + } + return nil +} + +func (c *ShardedCompactionLifecycleCallback) PostCompactionCallback(_ context.Context, logger log.Logger, cg *compact.Group, _ ulid.ULID) error { + metaExt, err := cortextsdb.ConvertToCortexMetaExtensions(cg.Extensions()) + if err != nil { + level.Warn(logger).Log("msg", "unable to get cortex meta extensions", "err", err) + } else if metaExt != nil { + c.compactorMetrics.compactionDuration.WithLabelValues(c.userID, metaExt.TimeRangeStr()).Set(time.Since(c.startTime).Seconds()) + } + return nil +} + +func (c *ShardedCompactionLifecycleCallback) GetBlockPopulator(_ context.Context, logger log.Logger, cg *compact.Group) (tsdb.BlockPopulator, error) { + partitionInfo, err := cortextsdb.ConvertToPartitionInfo(cg.Extensions()) + if err != nil { + return nil, err + } + if partitionInfo == nil { + return tsdb.DefaultBlockPopulator{}, nil + } + if partitionInfo.PartitionCount <= 0 { + partitionInfo = &cortextsdb.PartitionInfo{ + PartitionCount: 1, + PartitionID: partitionInfo.PartitionID, + PartitionedGroupID: partitionInfo.PartitionedGroupID, + PartitionedGroupCreationTime: partitionInfo.PartitionedGroupCreationTime, + } + cg.SetExtensions(&cortextsdb.CortexMetaExtensions{ + PartitionInfo: partitionInfo, + }) + } + populateBlockFunc := ShardedBlockPopulator{ + partitionCount: partitionInfo.PartitionCount, + partitionID: partitionInfo.PartitionID, + logger: logger, + } + return populateBlockFunc, nil +} diff --git a/pkg/compactor/sharded_compaction_lifecycle_callback_test.go b/pkg/compactor/sharded_compaction_lifecycle_callback_test.go new file mode 100644 index 00000000000..09157c895b8 --- /dev/null +++ b/pkg/compactor/sharded_compaction_lifecycle_callback_test.go @@ -0,0 +1,96 @@ +package compactor + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/compact" +) + +func TestPreCompactionCallback(t *testing.T) { + compactDir, err := os.MkdirTemp(os.TempDir(), "compact") + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(compactDir)) + }) + + lifecycleCallback := ShardedCompactionLifecycleCallback{ + compactDir: compactDir, + } + + block1 := ulid.MustNew(1, nil) + block2 := ulid.MustNew(2, nil) + block3 := ulid.MustNew(3, nil) + meta := []*metadata.Meta{ + { + BlockMeta: tsdb.BlockMeta{ULID: block1, MinTime: 1 * time.Hour.Milliseconds(), MaxTime: 2 * time.Hour.Milliseconds()}, + }, + { + BlockMeta: tsdb.BlockMeta{ULID: block2, MinTime: 1 * time.Hour.Milliseconds(), MaxTime: 2 * time.Hour.Milliseconds()}, + }, + { + BlockMeta: tsdb.BlockMeta{ULID: block3, MinTime: 2 * time.Hour.Milliseconds(), MaxTime: 3 * time.Hour.Milliseconds()}, + }, + } + testGroupKey := "test_group_key" + testGroup, _ := compact.NewGroup( + log.NewNopLogger(), + nil, + testGroupKey, + nil, + 0, + true, + true, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + metadata.NoneFunc, + 1, + 1, + ) + for _, m := range meta { + err := testGroup.AppendMeta(m) + require.NoError(t, err) + } + + 
dummyGroupID1 := "dummy_dir_1" + dummyGroupID2 := "dummy_dir_2" + err = os.MkdirAll(filepath.Join(compactDir, testGroupKey), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(compactDir, testGroupKey, block1.String()), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(compactDir, dummyGroupID1), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(compactDir, dummyGroupID2), 0750) + require.NoError(t, err) + + err = lifecycleCallback.PreCompactionCallback(context.Background(), log.NewNopLogger(), testGroup, meta) + require.NoError(t, err) + + info, err := os.Stat(filepath.Join(compactDir, testGroupKey)) + require.NoError(t, err) + require.True(t, info.IsDir()) + info, err = os.Stat(filepath.Join(compactDir, testGroupKey, block1.String())) + require.NoError(t, err) + require.True(t, info.IsDir()) + _, err = os.Stat(filepath.Join(compactDir, dummyGroupID1)) + require.Error(t, err) + require.True(t, os.IsNotExist(err)) + _, err = os.Stat(filepath.Join(compactDir, dummyGroupID2)) + require.Error(t, err) + require.True(t, os.IsNotExist(err)) +} diff --git a/pkg/compactor/sharded_posting.go b/pkg/compactor/sharded_posting.go new file mode 100644 index 00000000000..b0c29ca1c98 --- /dev/null +++ b/pkg/compactor/sharded_posting.go @@ -0,0 +1,38 @@ +package compactor + +import ( + "context" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/index" + + "github.com/cortexproject/cortex/pkg/util" +) + +func NewShardedPosting(ctx context.Context, postings index.Postings, partitionCount uint64, partitionID uint64, labelsFn func(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error) (index.Postings, map[string]struct{}, error) { + series := make([]storage.SeriesRef, 0) + symbols := make(map[string]struct{}) + var builder labels.ScratchBuilder + cnt := 0 + for postings.Next() { + cnt++ + if cnt%util.CheckContextEveryNIterations == 0 && ctx.Err() != nil { + return nil, nil, ctx.Err() + } + err := labelsFn(postings.At(), &builder, nil) + if err != nil { + return nil, nil, err + } + if builder.Labels().Hash()%partitionCount == partitionID { + posting := postings.At() + series = append(series, posting) + for _, label := range builder.Labels() { + symbols[label.Name] = struct{}{} + symbols[label.Value] = struct{}{} + } + } + } + return index.NewListPostings(series), symbols, nil +} diff --git a/pkg/compactor/sharded_posting_test.go b/pkg/compactor/sharded_posting_test.go new file mode 100644 index 00000000000..6c868fa940e --- /dev/null +++ b/pkg/compactor/sharded_posting_test.go @@ -0,0 +1,109 @@ +package compactor + +import ( + "context" + "io" + "math/rand" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/index" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/testutil/e2eutil" +) + +const ( + MetricLabelName = "__name__" + MetricName = "test_metric" + TestLabelName = "test_label" + ConstLabelName = "const_label" + ConstLabelValue = "const_value" +) + +func TestShardPostingAndSymbolBasedOnPartitionID(t *testing.T) { + partitionCount := 8 + + tmpdir, err := os.MkdirTemp("", "sharded_posting_test") + require.NoError(t, err) + 
t.Cleanup(func() { + require.NoError(t, os.RemoveAll(tmpdir)) + }) + + r := rand.New(rand.NewSource(0)) + var series []labels.Labels + expectedSymbols := make(map[string]bool) + metricName := labels.Label{Name: MetricLabelName, Value: MetricName} + expectedSymbols[MetricLabelName] = false + expectedSymbols[MetricName] = false + expectedSymbols[ConstLabelName] = false + expectedSymbols[ConstLabelValue] = false + expectedSeriesCount := 10 + for i := 0; i < expectedSeriesCount; i++ { + labelValue := strconv.Itoa(r.Int()) + series = append(series, labels.Labels{ + metricName, + {Name: ConstLabelName, Value: ConstLabelValue}, + {Name: TestLabelName, Value: labelValue}, + }) + expectedSymbols[TestLabelName] = false + expectedSymbols[labelValue] = false + } + blockID, err := e2eutil.CreateBlock(context.Background(), tmpdir, series, 10, time.Now().Add(-10*time.Minute).UnixMilli(), time.Now().UnixMilli(), nil, 0, metadata.NoneFunc) + require.NoError(t, err) + + var closers []io.Closer + defer func() { + for _, c := range closers { + c.Close() + } + }() + seriesCount := 0 + for partitionID := 0; partitionID < partitionCount; partitionID++ { + ir, err := index.NewFileReader(filepath.Join(tmpdir, blockID.String(), "index")) + closers = append(closers, ir) + require.NoError(t, err) + k, v := index.AllPostingsKey() + postings, err := ir.Postings(context.Background(), k, v) + require.NoError(t, err) + postings = ir.SortedPostings(postings) + shardedPostings, syms, err := NewShardedPosting(context.Background(), postings, uint64(partitionCount), uint64(partitionID), ir.Series) + require.NoError(t, err) + bufChks := make([]chunks.Meta, 0) + expectedShardedSymbols := make(map[string]struct{}) + for shardedPostings.Next() { + var builder labels.ScratchBuilder + err = ir.Series(shardedPostings.At(), &builder, &bufChks) + require.NoError(t, err) + require.Equal(t, uint64(partitionID), builder.Labels().Hash()%uint64(partitionCount)) + seriesCount++ + for _, label := range builder.Labels() { + expectedShardedSymbols[label.Name] = struct{}{} + expectedShardedSymbols[label.Value] = struct{}{} + } + } + err = ir.Close() + if err == nil { + closers = closers[0 : len(closers)-1] + } + symbolsCount := 0 + for s := range syms { + symbolsCount++ + _, ok := expectedSymbols[s] + require.True(t, ok) + expectedSymbols[s] = true + _, ok = expectedShardedSymbols[s] + require.True(t, ok) + } + require.Equal(t, len(expectedShardedSymbols), symbolsCount) + } + require.Equal(t, expectedSeriesCount, seriesCount) + for _, visited := range expectedSymbols { + require.True(t, visited) + } +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/copy.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/copy.go new file mode 100644 index 00000000000..6464cd02ba8 --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/copy.go @@ -0,0 +1,55 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. 
+ +package e2eutil + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/efficientgo/core/testutil" + "github.com/pkg/errors" + "github.com/thanos-io/thanos/pkg/runutil" +) + +func Copy(t testing.TB, src, dst string) { + testutil.Ok(t, copyRecursive(src, dst)) +} + +func copyRecursive(src, dst string) error { + return filepath.Walk(src, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + relPath, err := filepath.Rel(src, path) + if err != nil { + return err + } + + if info.IsDir() { + return os.MkdirAll(filepath.Join(dst, relPath), os.ModePerm) + } + + if !info.Mode().IsRegular() { + return errors.Errorf("%s is not a regular file", path) + } + + source, err := os.Open(filepath.Clean(path)) + if err != nil { + return err + } + defer runutil.CloseWithErrCapture(&err, source, "close file") + + destination, err := os.Create(filepath.Join(dst, relPath)) + if err != nil { + return err + } + defer runutil.CloseWithErrCapture(&err, destination, "close file") + + _, err = io.Copy(destination, source) + return err + }) +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/port.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/port.go new file mode 100644 index 00000000000..986f1c7d7fd --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/port.go @@ -0,0 +1,20 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package e2eutil + +import "net" + +// FreePort returns port that is free now. +func FreePort() (int, error) { + addr, err := net.ResolveTCPAddr("tcp", ":0") + if err != nil { + return 0, err + } + + l, err := net.ListenTCP("tcp", addr) + if err != nil { + return 0, err + } + return l.Addr().(*net.TCPAddr).Port, l.Close() +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/prometheus.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/prometheus.go new file mode 100644 index 00000000000..5d784a9cc0b --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/prometheus.go @@ -0,0 +1,818 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package e2eutil + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "math" + "math/rand" + "net/http" + "os" + "os/exec" + "path" + "path/filepath" + "runtime" + "strings" + "sync" + "syscall" + "testing" + "time" + + "github.com/efficientgo/core/testutil" + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/prometheus/prometheus/model/histogram" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/timestamp" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/tsdb/index" + "go.uber.org/atomic" + "golang.org/x/sync/errgroup" + + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/runutil" +) + +const ( + defaultPrometheusVersion = "v0.54.1" + defaultAlertmanagerVersion = "v0.20.0" + defaultMinioVersion = "RELEASE.2022-07-30T05-21-40Z" + + // Space delimited list of versions. 
+ promPathsEnvVar = "THANOS_TEST_PROMETHEUS_PATHS" + alertmanagerBinEnvVar = "THANOS_TEST_ALERTMANAGER_PATH" + minioBinEnvVar = "THANOS_TEST_MINIO_PATH" + + // A placeholder for actual Prometheus instance address in the scrape config. + PromAddrPlaceHolder = "PROMETHEUS_ADDRESS" +) + +var ( + histogramSample = histogram.Histogram{ + Schema: 0, + Count: 20, + Sum: -3.1415, + ZeroCount: 12, + ZeroThreshold: 0.001, + NegativeSpans: []histogram.Span{ + {Offset: 0, Length: 4}, + {Offset: 1, Length: 1}, + }, + NegativeBuckets: []int64{1, 2, -2, 1, -1}, + } + + floatHistogramSample = histogram.FloatHistogram{ + ZeroThreshold: 0.01, + ZeroCount: 5.5, + Count: 15, + Sum: 11.5, + PositiveSpans: []histogram.Span{ + {Offset: -2, Length: 2}, + {Offset: 1, Length: 3}, + }, + PositiveBuckets: []float64{0.5, 0, 1.5, 2, 3.5}, + NegativeSpans: []histogram.Span{ + {Offset: 3, Length: 2}, + {Offset: 3, Length: 2}, + }, + NegativeBuckets: []float64{1.5, 0.5, 2.5, 3}, + } +) + +func PrometheusBinary() string { + return "prometheus-" + defaultPrometheusVersion +} + +func AlertmanagerBinary() string { + b := os.Getenv(alertmanagerBinEnvVar) + if b == "" { + return fmt.Sprintf("alertmanager-%s", defaultAlertmanagerVersion) + } + return b +} + +func MinioBinary() string { + b := os.Getenv(minioBinEnvVar) + if b == "" { + return fmt.Sprintf("minio-%s", defaultMinioVersion) + } + return b +} + +// Prometheus represents a test instance for integration testing. +// It can be populated with data before being started. +type Prometheus struct { + dir string + db *tsdb.DB + prefix string + binPath string + + running bool + cmd *exec.Cmd + disabledCompaction bool + addr string + + config string + + stdout, stderr bytes.Buffer +} + +func NewTSDB() (*tsdb.DB, error) { + dir, err := os.MkdirTemp("", "prometheus-test") + if err != nil { + return nil, err + } + opts := tsdb.DefaultOptions() + opts.RetentionDuration = math.MaxInt64 + return tsdb.Open(dir, nil, nil, opts, nil) +} + +func ForeachPrometheus(t *testing.T, testFn func(t testing.TB, p *Prometheus)) { + paths := os.Getenv(promPathsEnvVar) + if paths == "" { + paths = PrometheusBinary() + } + + for _, path := range strings.Split(paths, " ") { + if ok := t.Run(path, func(t *testing.T) { + p, err := newPrometheus(path, "") + testutil.Ok(t, err) + + testFn(t, p) + testutil.Ok(t, p.Stop()) + }); !ok { + return + } + } +} + +// NewPrometheus creates a new test Prometheus instance that will listen on local address. +// Use ForeachPrometheus if you want to test against set of Prometheus versions. +// TODO(bwplotka): Improve it with https://github.com/thanos-io/thanos/issues/758. +func NewPrometheus() (*Prometheus, error) { + return newPrometheus("", "") +} + +// NewPrometheusOnPath creates a new test Prometheus instance that will listen on local address and given prefix path. 
+func NewPrometheusOnPath(prefix string) (*Prometheus, error) { + return newPrometheus("", prefix) +} + +func newPrometheus(binPath, prefix string) (*Prometheus, error) { + if binPath == "" { + binPath = PrometheusBinary() + } + + db, err := NewTSDB() + if err != nil { + return nil, err + } + + f, err := os.Create(filepath.Join(db.Dir(), "prometheus.yml")) + if err != nil { + return nil, err + } + defer f.Close() + + // Some well-known external labels so that we can test label resorting + if _, err = io.WriteString(f, "global:\n external_labels:\n region: eu-west"); err != nil { + return nil, err + } + + return &Prometheus{ + dir: db.Dir(), + db: db, + prefix: prefix, + binPath: binPath, + addr: "", + }, nil +} + +// Start running the Prometheus instance and return. +func (p *Prometheus) Start(ctx context.Context, l log.Logger) error { + if p.running { + return errors.New("Already started") + } + + if err := p.db.Close(); err != nil { + return err + } + if err := p.start(); err != nil { + return err + } + if err := p.waitPrometheusUp(ctx, l, p.prefix); err != nil { + return err + } + return nil +} + +func (p *Prometheus) start() error { + port, err := FreePort() + if err != nil { + return err + } + + var extra []string + if p.disabledCompaction { + extra = append(extra, + "--storage.tsdb.min-block-duration=2h", + "--storage.tsdb.max-block-duration=2h", + ) + } + p.addr = fmt.Sprintf("localhost:%d", port) + // Write the final config to the config file. + // The address placeholder will be replaced with the actual address. + if err := p.writeConfig(strings.ReplaceAll(p.config, PromAddrPlaceHolder, p.addr)); err != nil { + return err + } + args := append([]string{ + "--storage.tsdb.retention=2d", // Pass retention cause prometheus since 2.8.0 don't show default value for that flags in web/api: https://github.com/prometheus/prometheus/pull/5433. + "--storage.tsdb.path=" + p.db.Dir(), + "--web.listen-address=" + p.addr, + "--web.route-prefix=" + p.prefix, + "--web.enable-admin-api", + "--config.file=" + filepath.Join(p.db.Dir(), "prometheus.yml"), + }, extra...) + + p.cmd = exec.Command(p.binPath, args...) + p.cmd.SysProcAttr = SysProcAttr() + + p.stderr.Reset() + p.stdout.Reset() + + p.cmd.Stdout = &p.stdout + p.cmd.Stderr = &p.stderr + + if err := p.cmd.Start(); err != nil { + return fmt.Errorf("starting Prometheus failed: %w", err) + } + + p.running = true + return nil +} + +func (p *Prometheus) waitPrometheusUp(ctx context.Context, logger log.Logger, prefix string) error { + if !p.running { + return errors.New("method Start was not invoked.") + } + return runutil.RetryWithLog(logger, time.Second, ctx.Done(), func() error { + r, err := http.Get(fmt.Sprintf("http://%s%s/-/ready", p.addr, prefix)) + if err != nil { + return err + } + defer runutil.ExhaustCloseWithLogOnErr(logger, r.Body, "failed to exhaust and close body") + + if r.StatusCode != 200 { + return errors.Errorf("Got non 200 response: %v", r.StatusCode) + } + return nil + }) +} + +func (p *Prometheus) Restart(ctx context.Context, l log.Logger) error { + if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil { + return errors.Wrap(err, "failed to kill Prometheus. Kill it manually") + } + _ = p.cmd.Wait() + if err := p.start(); err != nil { + return err + } + return p.waitPrometheusUp(ctx, l, p.prefix) +} + +// Dir returns TSDB dir. +func (p *Prometheus) Dir() string { + return p.dir +} + +// Addr returns correct address after Start method. 
+func (p *Prometheus) Addr() string { + return p.addr + p.prefix +} + +func (p *Prometheus) DisableCompaction() { + p.disabledCompaction = true +} + +// SetConfig updates the contents of the config. +func (p *Prometheus) SetConfig(s string) { + p.config = s +} + +// writeConfig writes the Prometheus config to the config file. +func (p *Prometheus) writeConfig(config string) (err error) { + f, err := os.Create(filepath.Join(p.dir, "prometheus.yml")) + if err != nil { + return err + } + defer runutil.CloseWithErrCapture(&err, f, "prometheus config") + _, err = f.Write([]byte(config)) + return err +} + +// Stop terminates Prometheus and clean up its data directory. +func (p *Prometheus) Stop() (rerr error) { + if !p.running { + return nil + } + + if p.cmd.Process != nil { + if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil { + return errors.Wrapf(err, "failed to Prometheus. Kill it manually and clean %s dir", p.db.Dir()) + } + + err := p.cmd.Wait() + if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + if exitErr.ExitCode() != -1 { + fmt.Fprintln(os.Stderr, "Prometheus exited with", exitErr.ExitCode()) + fmt.Fprintln(os.Stderr, "stdout:\n", p.stdout.String(), "\nstderr:\n", p.stderr.String()) + } else { + err = nil + } + } + } + + if err != nil { + return fmt.Errorf("waiting for Prometheus to exit: %w", err) + } + } + + return p.cleanup() +} + +func (p *Prometheus) cleanup() error { + p.running = false + return os.RemoveAll(p.db.Dir()) +} + +// Appender returns a new appender to populate the Prometheus instance with data. +// All appenders must be closed before Start is called and no new ones must be opened +// afterwards. +func (p *Prometheus) Appender() storage.Appender { + if p.running { + panic("Appender must not be called after start") + } + return p.db.Appender(context.Background()) +} + +// CreateEmptyBlock produces empty block like it was the case before fix: https://github.com/prometheus/tsdb/pull/374. +// (Prometheus pre v2.7.0). 
+func CreateEmptyBlock(dir string, mint, maxt int64, extLset labels.Labels, resolution int64) (ulid.ULID, error) { + entropy := rand.New(rand.NewSource(time.Now().UnixNano())) + uid := ulid.MustNew(ulid.Now(), entropy) + + if err := os.Mkdir(path.Join(dir, uid.String()), os.ModePerm); err != nil { + return ulid.ULID{}, errors.Wrap(err, "close index") + } + + if err := os.Mkdir(path.Join(dir, uid.String(), "chunks"), os.ModePerm); err != nil { + return ulid.ULID{}, errors.Wrap(err, "close index") + } + + w, err := index.NewWriter(context.Background(), path.Join(dir, uid.String(), "index")) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "new index") + } + + if err := w.Close(); err != nil { + return ulid.ULID{}, errors.Wrap(err, "close index") + } + + m := tsdb.BlockMeta{ + Version: 1, + ULID: uid, + MinTime: mint, + MaxTime: maxt, + Compaction: tsdb.BlockMetaCompaction{ + Level: 1, + Sources: []ulid.ULID{uid}, + }, + } + b, err := json.Marshal(&m) + if err != nil { + return ulid.ULID{}, err + } + + if err := os.WriteFile(path.Join(dir, uid.String(), "meta.json"), b, os.ModePerm); err != nil { + return ulid.ULID{}, errors.Wrap(err, "saving meta.json") + } + + if _, err = metadata.InjectThanos(log.NewNopLogger(), filepath.Join(dir, uid.String()), metadata.Thanos{ + Labels: extLset.Map(), + Downsample: metadata.ThanosDownsample{Resolution: resolution}, + Source: metadata.TestSource, + }, nil); err != nil { + return ulid.ULID{}, errors.Wrap(err, "finalize block") + } + + return uid, nil +} + +// CreateBlock writes a block with the given series and numSamples samples each. +// Samples will be in the time range [mint, maxt). +func CreateBlock( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + extLset labels.Labels, + resolution int64, + hashFunc metadata.HashFunc, +) (id ulid.ULID, err error) { + return createBlock(ctx, dir, series, numSamples, mint, maxt, extLset, resolution, false, hashFunc, chunkenc.ValFloat) +} + +// CreateBlockWithTombstone is same as CreateBlock but leaves tombstones which mimics the Prometheus local block. +func CreateBlockWithTombstone( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + extLset labels.Labels, + resolution int64, + hashFunc metadata.HashFunc, +) (id ulid.ULID, err error) { + return createBlock(ctx, dir, series, numSamples, mint, maxt, extLset, resolution, true, hashFunc, chunkenc.ValFloat) +} + +// CreateBlockWithBlockDelay writes a block with the given series and numSamples samples each. +// Samples will be in the time range [mint, maxt) +// Block ID will be created with a delay of time duration blockDelay. +func CreateBlockWithBlockDelay( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + blockDelay time.Duration, + extLset labels.Labels, + resolution int64, + hashFunc metadata.HashFunc, +) (ulid.ULID, error) { + return createBlockWithDelay(ctx, dir, series, numSamples, mint, maxt, blockDelay, extLset, resolution, hashFunc, chunkenc.ValFloat) +} + +// CreateHistogramBlockWithDelay writes a block with the given native histogram series and numSamples samples each. +// Samples will be in the time range [mint, maxt). 
+func CreateHistogramBlockWithDelay( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + blockDelay time.Duration, + extLset labels.Labels, + resolution int64, + hashFunc metadata.HashFunc, +) (id ulid.ULID, err error) { + return createBlockWithDelay(ctx, dir, series, numSamples, mint, maxt, blockDelay, extLset, resolution, hashFunc, chunkenc.ValHistogram) +} + +// CreateFloatHistogramBlockWithDelay writes a block with the given float native histogram series and numSamples samples each. +// Samples will be in the time range [mint, maxt). +func CreateFloatHistogramBlockWithDelay( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + blockDelay time.Duration, + extLset labels.Labels, + resolution int64, + hashFunc metadata.HashFunc, +) (id ulid.ULID, err error) { + return createBlockWithDelay(ctx, dir, series, numSamples, mint, maxt, blockDelay, extLset, resolution, hashFunc, chunkenc.ValFloatHistogram) +} + +func createBlockWithDelay(ctx context.Context, dir string, series []labels.Labels, numSamples int, mint int64, maxt int64, blockDelay time.Duration, extLset labels.Labels, resolution int64, hashFunc metadata.HashFunc, samplesType chunkenc.ValueType) (ulid.ULID, error) { + blockID, err := createBlock(ctx, dir, series, numSamples, mint, maxt, extLset, resolution, false, hashFunc, samplesType) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "block creation") + } + + id, err := ulid.New(uint64(timestamp.FromTime(timestamp.Time(int64(blockID.Time())).Add(-blockDelay))), bytes.NewReader(blockID.Entropy())) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "create block id") + } + + bdir := path.Join(dir, blockID.String()) + m, err := metadata.ReadFromDir(bdir) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "open meta file") + } + + logger := log.NewNopLogger() + m.ULID = id + m.Compaction.Sources = []ulid.ULID{id} + if err := m.WriteToDir(logger, path.Join(dir, blockID.String())); err != nil { + return ulid.ULID{}, errors.Wrap(err, "write meta.json file") + } + + return id, os.Rename(path.Join(dir, blockID.String()), path.Join(dir, id.String())) +} + +func createBlock( + ctx context.Context, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + extLset labels.Labels, + resolution int64, + tombstones bool, + hashFunc metadata.HashFunc, + sampleType chunkenc.ValueType, +) (id ulid.ULID, err error) { + headOpts := tsdb.DefaultHeadOptions() + headOpts.ChunkDirRoot = filepath.Join(dir, "chunks") + headOpts.ChunkRange = 10000000000 + headOpts.EnableNativeHistograms = *atomic.NewBool(true) + h, err := tsdb.NewHead(nil, nil, nil, nil, headOpts, nil) + if err != nil { + return id, errors.Wrap(err, "create head block") + } + defer func() { + runutil.CloseWithErrCapture(&err, h, "TSDB Head") + if e := os.RemoveAll(headOpts.ChunkDirRoot); e != nil { + err = errors.Wrap(e, "delete chunks dir") + } + }() + + var g errgroup.Group + var timeStepSize = (maxt - mint) / int64(numSamples+1) + var batchSize = len(series) / runtime.GOMAXPROCS(0) + r := rand.New(rand.NewSource(int64(numSamples))) + var randMutex sync.Mutex + + for len(series) > 0 { + l := batchSize + if len(series) < 1000 { + l = len(series) + } + batch := series[:l] + series = series[l:] + + g.Go(func() error { + t := mint + + for i := 0; i < numSamples; i++ { + app := h.Appender(ctx) + + for _, lset := range batch { + var err error + if sampleType == chunkenc.ValFloat { + randMutex.Lock() + _, err 
= app.Append(0, lset, t, r.Float64()) + randMutex.Unlock() + } else if sampleType == chunkenc.ValHistogram { + _, err = app.AppendHistogram(0, lset, t, &histogramSample, nil) + } else if sampleType == chunkenc.ValFloatHistogram { + _, err = app.AppendHistogram(0, lset, t, nil, &floatHistogramSample) + } + if err != nil { + if rerr := app.Rollback(); rerr != nil { + err = errors.Wrapf(err, "rollback failed: %v", rerr) + } + + return errors.Wrap(err, "add sample") + } + } + if err := app.Commit(); err != nil { + return errors.Wrap(err, "commit") + } + t += timeStepSize + } + return nil + }) + } + if err := g.Wait(); err != nil { + return id, err + } + c, err := tsdb.NewLeveledCompactor(ctx, nil, log.NewNopLogger(), []int64{maxt - mint}, nil, nil) + if err != nil { + return id, errors.Wrap(err, "create compactor") + } + + ids, err := c.Write(dir, h, mint, maxt, nil) + if err != nil { + return id, errors.Wrap(err, "write block") + } + if len(ids) == 0 { + return id, errors.Errorf("nothing to write, asked for %d samples", numSamples) + } + id = ids[0] + + blockDir := filepath.Join(dir, id.String()) + logger := log.NewNopLogger() + seriesSize, err := gatherMaxSeriesSize(ctx, filepath.Join(blockDir, "index")) + if err != nil { + return id, errors.Wrap(err, "gather max series size") + } + + files := []metadata.File{} + if hashFunc != metadata.NoneFunc { + paths := []string{} + if err := filepath.Walk(blockDir, func(path string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + paths = append(paths, path) + return nil + }); err != nil { + return id, errors.Wrapf(err, "walking %s", dir) + } + + for _, p := range paths { + pHash, err := metadata.CalculateHash(p, metadata.SHA256Func, log.NewNopLogger()) + if err != nil { + return id, errors.Wrapf(err, "calculating hash of %s", blockDir+p) + } + files = append(files, metadata.File{ + RelPath: strings.TrimPrefix(p, blockDir+"/"), + Hash: &pHash, + }) + } + } + + if _, err = metadata.InjectThanos(logger, blockDir, metadata.Thanos{ + Labels: extLset.Map(), + Downsample: metadata.ThanosDownsample{Resolution: resolution}, + Source: metadata.TestSource, + Files: files, + IndexStats: metadata.IndexStats{SeriesMaxSize: seriesSize}, + }, nil); err != nil { + return id, errors.Wrap(err, "finalize block") + } + + if !tombstones { + if err = os.Remove(filepath.Join(dir, id.String(), "tombstones")); err != nil { + return id, errors.Wrap(err, "remove tombstones") + } + } + + return id, nil +} + +func gatherMaxSeriesSize(ctx context.Context, fn string) (int64, error) { + r, err := index.NewFileReader(fn) + if err != nil { + return 0, errors.Wrap(err, "open index file") + } + defer runutil.CloseWithErrCapture(&err, r, "gather index issue file reader") + + key, value := index.AllPostingsKey() + p, err := r.Postings(ctx, key, value) + if err != nil { + return 0, errors.Wrap(err, "get all postings") + } + + // As of version two all series entries are 16 byte padded. All references + // we get have to account for that to get the correct offset. + offsetMultiplier := 1 + version := r.Version() + if version >= 2 { + offsetMultiplier = 16 + } + + // Per series. + var ( + prevId storage.SeriesRef + maxSeriesSize int64 + ) + for p.Next() { + id := p.At() + if prevId != 0 { + // Approximate size. 
+ seriesSize := int64(id-prevId) * int64(offsetMultiplier) + if seriesSize > maxSeriesSize { + maxSeriesSize = seriesSize + } + } + prevId = id + } + if p.Err() != nil { + return 0, errors.Wrap(err, "walk postings") + } + + return maxSeriesSize, nil +} + +// CreateBlockWithChurn writes a block with the given series. Start time of each series +// will be randomized in the given time window to create churn. Only float chunk is supported right now. +func CreateBlockWithChurn( + ctx context.Context, + rnd *rand.Rand, + dir string, + series []labels.Labels, + numSamples int, + mint, maxt int64, + extLset labels.Labels, + resolution int64, + scrapeInterval int64, + seriesSize int64, +) (id ulid.ULID, err error) { + headOpts := tsdb.DefaultHeadOptions() + headOpts.ChunkDirRoot = filepath.Join(dir, "chunks") + headOpts.ChunkRange = 10000000000 + h, err := tsdb.NewHead(nil, nil, nil, nil, headOpts, nil) + if err != nil { + return id, errors.Wrap(err, "create head block") + } + defer func() { + runutil.CloseWithErrCapture(&err, h, "TSDB Head") + if e := os.RemoveAll(headOpts.ChunkDirRoot); e != nil { + err = errors.Wrap(e, "delete chunks dir") + } + }() + + app := h.Appender(ctx) + for i := 0; i < len(series); i++ { + + var ref storage.SeriesRef + start := RandRange(rnd, mint, maxt) + for j := 0; j < numSamples; j++ { + if ref == 0 { + ref, err = app.Append(0, series[i], start, float64(i+j)) + } else { + ref, err = app.Append(ref, series[i], start, float64(i+j)) + } + if err != nil { + if rerr := app.Rollback(); rerr != nil { + err = errors.Wrapf(err, "rollback failed: %v", rerr) + } + return id, errors.Wrap(err, "add sample") + } + start += scrapeInterval + if start > maxt { + break + } + } + } + if err := app.Commit(); err != nil { + return id, errors.Wrap(err, "commit") + } + + c, err := tsdb.NewLeveledCompactor(ctx, nil, log.NewNopLogger(), []int64{maxt - mint}, nil, nil) + if err != nil { + return id, errors.Wrap(err, "create compactor") + } + + ids, err := c.Write(dir, h, mint, maxt, nil) + if err != nil { + return id, errors.Wrap(err, "write block") + } + + if len(ids) == 0 { + return id, errors.Errorf("nothing to write, asked for %d samples", numSamples) + } + id = ids[0] + + blockDir := filepath.Join(dir, id.String()) + logger := log.NewNopLogger() + + if _, err = metadata.InjectThanos(logger, blockDir, metadata.Thanos{ + Labels: extLset.Map(), + Downsample: metadata.ThanosDownsample{Resolution: resolution}, + Source: metadata.TestSource, + IndexStats: metadata.IndexStats{SeriesMaxSize: seriesSize}, + }, nil); err != nil { + return id, errors.Wrap(err, "finalize block") + } + + return id, nil +} + +// AddDelay rewrites a given block with delay. 
+func AddDelay(blockID ulid.ULID, dir string, blockDelay time.Duration) (ulid.ULID, error) { + id, err := ulid.New(uint64(timestamp.FromTime(timestamp.Time(int64(blockID.Time())).Add(-blockDelay))), bytes.NewReader(blockID.Entropy())) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "create block id") + } + + bdir := path.Join(dir, blockID.String()) + m, err := metadata.ReadFromDir(bdir) + if err != nil { + return ulid.ULID{}, errors.Wrap(err, "open meta file") + } + + logger := log.NewNopLogger() + m.ULID = id + m.Compaction.Sources = []ulid.ULID{id} + if err := m.WriteToDir(logger, path.Join(dir, blockID.String())); err != nil { + return ulid.ULID{}, errors.Wrap(err, "write meta.json file") + } + + return id, os.Rename(path.Join(dir, blockID.String()), path.Join(dir, id.String())) +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/rand.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/rand.go new file mode 100644 index 00000000000..5cac2d6f078 --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/rand.go @@ -0,0 +1,11 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package e2eutil + +import "math/rand" + +// RandRange returns a random int64 from [min, max]. +func RandRange(rnd *rand.Rand, min, max int64) int64 { + return rnd.Int63n(max-min) + min +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr.go new file mode 100644 index 00000000000..53aaa7039f9 --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr.go @@ -0,0 +1,13 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +//go:build !linux +// +build !linux + +package e2eutil + +import "syscall" + +func SysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{} +} diff --git a/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr_linux.go b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr_linux.go new file mode 100644 index 00000000000..dd77ed32a18 --- /dev/null +++ b/vendor/github.com/thanos-io/thanos/pkg/testutil/e2eutil/sysprocattr_linux.go @@ -0,0 +1,13 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package e2eutil + +import "syscall" + +func SysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + // For linux only, kill this if the go test process dies before the cleanup. 
+ Pdeathsig: syscall.SIGKILL, + } +} diff --git a/vendor/modules.txt b/vendor/modules.txt index a8d2eac2f3e..e9f5b61da7d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1038,6 +1038,7 @@ github.com/thanos-io/thanos/pkg/store/storepb/prompb github.com/thanos-io/thanos/pkg/strutil github.com/thanos-io/thanos/pkg/targets/targetspb github.com/thanos-io/thanos/pkg/tenancy +github.com/thanos-io/thanos/pkg/testutil/e2eutil github.com/thanos-io/thanos/pkg/tls github.com/thanos-io/thanos/pkg/tracing github.com/thanos-io/thanos/pkg/tracing/interceptors From 79adc337f8afed627ceeea3722ce7cb034c52f22 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Tue, 21 Jan 2025 17:44:04 -0800 Subject: [PATCH 15/34] Test for nil on expire expanded postings (#6521) * Test for nil on expire expanded postings Signed-off-by: alanprot * stopping ingester Signed-off-by: alanprot * refactor the test to not timeout Signed-off-by: alanprot --------- Signed-off-by: alanprot Signed-off-by: Alex Le --- pkg/ingester/ingester_test.go | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 1de49fb8811..f9d319e187e 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -177,6 +177,77 @@ func TestMatcherCache(t *testing.T) { `, callPerMatcher*numberOfDifferentMatchers-numberOfDifferentMatchers, cfg.MatchersCacheMaxItems, callPerMatcher*numberOfDifferentMatchers)), "ingester_matchers_cache_requests_total", "ingester_matchers_cache_hits_total", "ingester_matchers_cache_items", "ingester_matchers_cache_max_items", "ingester_matchers_cache_evicted_total")) } +func TestIngesterDeletionRace(t *testing.T) { + registry := prometheus.NewRegistry() + limits := defaultLimitsTestConfig() + tenantLimits := newMockTenantLimits(map[string]*validation.Limits{userID: &limits}) + cfg := defaultIngesterTestConfig(t) + cfg.BlocksStorageConfig.TSDB.PostingsCache = cortex_tsdb.TSDBPostingsCacheConfig{ + Head: cortex_tsdb.PostingsCacheConfig{ + Enabled: true, + Ttl: time.Hour, + MaxBytes: 1024 * 1024 * 1024, + }, + Blocks: cortex_tsdb.PostingsCacheConfig{ + Enabled: true, + Ttl: time.Hour, + MaxBytes: 1024 * 1024 * 1024, + }, + } + + dir := t.TempDir() + chunksDir := filepath.Join(dir, "chunks") + blocksDir := filepath.Join(dir, "blocks") + require.NoError(t, os.Mkdir(chunksDir, os.ModePerm)) + require.NoError(t, os.Mkdir(blocksDir, os.ModePerm)) + + ing, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, limits, tenantLimits, blocksDir, registry, false) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing)) + defer services.StopAndAwaitTerminated(context.Background(), ing) //nolint:errcheck + // Wait until it's ACTIVE + test.Poll(t, time.Second, ring.ACTIVE, func() interface{} { + return ing.lifecycler.GetState() + }) + + numberOfTenants := 50 + wg := sync.WaitGroup{} + wg.Add(numberOfTenants) + + for i := 0; i < numberOfTenants; i++ { + go func() { + defer wg.Done() + u := fmt.Sprintf("userId_%v", i) + ctx := user.InjectOrgID(context.Background(), u) + samples := []cortexpb.Sample{{Value: 2, TimestampMs: 10}} + _, err := ing.Push(ctx, cortexpb.ToWriteRequest([]labels.Labels{labels.FromStrings(labels.MetricName, "name")}, samples, nil, nil, cortexpb.API)) + require.NoError(t, err) + ing.getTSDB(u).postingCache = &wrappedExpandedPostingsCache{ExpandedPostingsCache: ing.getTSDB(u).postingCache, purgeDelay: 10 * 
time.Millisecond} + ing.getTSDB(u).deletionMarkFound.Store(true) // lets force close the tenant + }() + } + + wg.Wait() + + ctx, c := context.WithCancel(context.Background()) + defer c() + + wg.Add(1) + go func() { + wg.Done() + ing.expirePostingsCache(ctx) //nolint:errcheck + }() + + go func() { + wg.Wait() // make sure we clean after we started the purge go routine + ing.closeAndDeleteIdleUserTSDBs(ctx) //nolint:errcheck + }() + + test.Poll(t, 5*time.Second, 0, func() interface{} { + return len(ing.getTSDBUsers()) + }) +} + func TestIngesterPerLabelsetLimitExceeded(t *testing.T) { limits := defaultLimitsTestConfig() userID := "1" @@ -3528,6 +3599,17 @@ func (m *mockMetricsForLabelMatchersStreamServer) Context() context.Context { return m.ctx } +type wrappedExpandedPostingsCache struct { + cortex_tsdb.ExpandedPostingsCache + + purgeDelay time.Duration +} + +func (w *wrappedExpandedPostingsCache) PurgeExpiredItems() { + time.Sleep(w.purgeDelay) + w.ExpandedPostingsCache.PurgeExpiredItems() +} + type mockQueryStreamServer struct { grpc.ServerStream ctx context.Context From 8203621e6d1228179731cd801b1f35e009a1545f Mon Sep 17 00:00:00 2001 From: Ahmed Hassan <57634502+afhassan@users.noreply.github.com> Date: Wed, 22 Jan 2025 09:14:19 -0800 Subject: [PATCH 16/34] log when a request starts running in querier (#6525) * log when a request starts running in querier Signed-off-by: Ahmed Hassan * log when a request starts running in querier for frontend processor Signed-off-by: Ahmed Hassan --------- Signed-off-by: Ahmed Hassan Signed-off-by: Alex Le --- pkg/querier/worker/frontend_processor.go | 3 +++ pkg/querier/worker/scheduler_processor.go | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pkg/querier/worker/frontend_processor.go b/pkg/querier/worker/frontend_processor.go index 3bdaa0e9a22..17bd031acfb 100644 --- a/pkg/querier/worker/frontend_processor.go +++ b/pkg/querier/worker/frontend_processor.go @@ -142,6 +142,9 @@ func (fp *frontendProcessor) runRequest(ctx context.Context, request *httpgrpc.H } ctx = util_log.ContextWithHeaderMap(ctx, headerMap) logger := util_log.WithContext(ctx, fp.log) + if statsEnabled { + level.Info(logger).Log("msg", "started running request") + } response, err := fp.handler.Handle(ctx, request) if err != nil { diff --git a/pkg/querier/worker/scheduler_processor.go b/pkg/querier/worker/scheduler_processor.go index 04d71cc69c7..aea9820153b 100644 --- a/pkg/querier/worker/scheduler_processor.go +++ b/pkg/querier/worker/scheduler_processor.go @@ -156,6 +156,9 @@ func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_Quer ctx = spanCtx } logger := util_log.WithContext(ctx, sp.log) + if request.StatsEnabled { + level.Info(logger).Log("msg", "started running request") + } sp.runRequest(ctx, logger, request.QueryID, request.FrontendAddress, request.StatsEnabled, request.HttpRequest) if err = ctx.Err(); err != nil { From 44032df94693826d14d5d79b5b88aeb2b092e5e2 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Wed, 22 Jan 2025 09:55:54 -0800 Subject: [PATCH 17/34] Update build image according to https://github.com/cortexproject/cortex/commit/03a8f8c9807e666b606a07bd2c5f4e29e18b0428 (#6508) Signed-off-by: Friedrich Gonzalez Signed-off-by: Alex Le --- .github/workflows/test-build-deploy.yml | 14 +++++++------- Makefile | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 
9e6506a57a7..88aa222aad7 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -17,7 +17,7 @@ jobs: lint: runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -46,7 +46,7 @@ jobs: test: runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -64,7 +64,7 @@ jobs: test-no-race: runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -107,7 +107,7 @@ jobs: build: runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -247,14 +247,14 @@ jobs: run: | touch build-image/.uptodate MIGRATIONS_DIR=$(pwd)/cmd/cortex/migrations - make BUILD_IMAGE=quay.io/cortexproject/build-image:master-d96523a32 TTY='' configs-integration-test + make BUILD_IMAGE=quay.io/cortexproject/build-image:master-03a8f8c98 TTY='' configs-integration-test deploy_website: needs: [build, test] if: (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/')) && github.repository == 'cortexproject/cortex' runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -296,7 +296,7 @@ jobs: if: (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/')) && github.repository == 'cortexproject/cortex' runs-on: ubuntu-20.04 container: - image: quay.io/cortexproject/build-image:master-d96523a32 + image: quay.io/cortexproject/build-image:master-03a8f8c98 steps: - name: Checkout Repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/Makefile b/Makefile index 05764ccb3d6..c28676a760d 100644 --- a/Makefile +++ b/Makefile @@ -115,7 +115,7 @@ build-image/$(UPTODATE): build-image/* SUDO := $(shell docker info >/dev/null 2>&1 || echo "sudo -E") BUILD_IN_CONTAINER := true BUILD_IMAGE ?= $(IMAGE_PREFIX)build-image -LATEST_BUILD_IMAGE_TAG ?= master-d96523a32 +LATEST_BUILD_IMAGE_TAG ?= master-03a8f8c98 # TTY is parameterized to allow Google Cloud Builder to run builds, # as it currently disallows TTY devices. 
This value needs to be overridden From 6c5ce8eb5f1fa6d8f4ee75b23ad0d0c6a432304b Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Wed, 22 Jan 2025 10:15:36 +0900 Subject: [PATCH 18/34] Deprecate -blocks-storage.tsdb.wal-compression-enabled flag Signed-off-by: SungJin1212 Signed-off-by: Alex Le --- CHANGELOG.md | 1 + docs/blocks-storage/querier.md | 5 ----- docs/blocks-storage/store-gateway.md | 5 ----- docs/configuration/config-file-reference.md | 5 ----- pkg/storage/tsdb/config.go | 6 ++++-- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3699d14bfe..a6e57611f33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased +* [CHANGE] Deprecate `-blocks-storage.tsdb.wal-compression-enabled` flag (use `blocks-storage.tsdb.wal-compression-type` instead). #6529 * [CHANGE] OTLP: Change OTLP handler to be consistent with the Prometheus OTLP handler. #6272 - `target_info` metric is enabled by default and can be disabled via `-distributor.otlp.disable-target-info=true` flag - Convert all attributes to labels is disabled by default and can be enabled via `-distributor.otlp.convert-all-attributes=true` flag diff --git a/docs/blocks-storage/querier.md b/docs/blocks-storage/querier.md index 19317be05ff..47dd9998dd9 100644 --- a/docs/blocks-storage/querier.md +++ b/docs/blocks-storage/querier.md @@ -1508,11 +1508,6 @@ blocks_storage: # CLI flag: -blocks-storage.tsdb.stripe-size [stripe_size: | default = 16384] - # Deprecated (use blocks-storage.tsdb.wal-compression-type instead): True to - # enable TSDB WAL compression. - # CLI flag: -blocks-storage.tsdb.wal-compression-enabled - [wal_compression_enabled: | default = false] - # TSDB WAL type. Supported values are: 'snappy', 'zstd' and '' (disable # compression) # CLI flag: -blocks-storage.tsdb.wal-compression-type diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md index e7a65dd58c1..fbd3c92af2c 100644 --- a/docs/blocks-storage/store-gateway.md +++ b/docs/blocks-storage/store-gateway.md @@ -1612,11 +1612,6 @@ blocks_storage: # CLI flag: -blocks-storage.tsdb.stripe-size [stripe_size: | default = 16384] - # Deprecated (use blocks-storage.tsdb.wal-compression-type instead): True to - # enable TSDB WAL compression. - # CLI flag: -blocks-storage.tsdb.wal-compression-enabled - [wal_compression_enabled: | default = false] - # TSDB WAL type. Supported values are: 'snappy', 'zstd' and '' (disable # compression) # CLI flag: -blocks-storage.tsdb.wal-compression-type diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 69d6713e930..f0a69fee853 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -2058,11 +2058,6 @@ tsdb: # CLI flag: -blocks-storage.tsdb.stripe-size [stripe_size: | default = 16384] - # Deprecated (use blocks-storage.tsdb.wal-compression-type instead): True to - # enable TSDB WAL compression. - # CLI flag: -blocks-storage.tsdb.wal-compression-enabled - [wal_compression_enabled: | default = false] - # TSDB WAL type. 
Supported values are: 'snappy', 'zstd' and '' (disable # compression) # CLI flag: -blocks-storage.tsdb.wal-compression-type diff --git a/pkg/storage/tsdb/config.go b/pkg/storage/tsdb/config.go index afb51d1a416..2a2e16c58d8 100644 --- a/pkg/storage/tsdb/config.go +++ b/pkg/storage/tsdb/config.go @@ -16,6 +16,8 @@ import ( "github.com/cortexproject/cortex/pkg/storage/bucket" "github.com/cortexproject/cortex/pkg/util" + "github.com/cortexproject/cortex/pkg/util/flagext" + util_log "github.com/cortexproject/cortex/pkg/util/log" ) const ( @@ -141,7 +143,6 @@ type TSDBConfig struct { HeadCompactionIdleTimeout time.Duration `yaml:"head_compaction_idle_timeout"` HeadChunksWriteBufferSize int `yaml:"head_chunks_write_buffer_size_bytes"` StripeSize int `yaml:"stripe_size"` - WALCompressionEnabled bool `yaml:"wal_compression_enabled"` WALCompressionType string `yaml:"wal_compression_type"` WALSegmentSizeBytes int `yaml:"wal_segment_size_bytes"` FlushBlocksOnShutdown bool `yaml:"flush_blocks_on_shutdown"` @@ -195,7 +196,6 @@ func (cfg *TSDBConfig) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.HeadCompactionIdleTimeout, "blocks-storage.tsdb.head-compaction-idle-timeout", 1*time.Hour, "If TSDB head is idle for this duration, it is compacted. Note that up to 25% jitter is added to the value to avoid ingesters compacting concurrently. 0 means disabled.") f.IntVar(&cfg.HeadChunksWriteBufferSize, "blocks-storage.tsdb.head-chunks-write-buffer-size-bytes", chunks.DefaultWriteBufferSize, "The write buffer size used by the head chunks mapper. Lower values reduce memory utilisation on clusters with a large number of tenants at the cost of increased disk I/O operations.") f.IntVar(&cfg.StripeSize, "blocks-storage.tsdb.stripe-size", 16384, "The number of shards of series to use in TSDB (must be a power of 2). Reducing this will decrease memory footprint, but can negatively impact performance.") - f.BoolVar(&cfg.WALCompressionEnabled, "blocks-storage.tsdb.wal-compression-enabled", false, "Deprecated (use blocks-storage.tsdb.wal-compression-type instead): True to enable TSDB WAL compression.") f.StringVar(&cfg.WALCompressionType, "blocks-storage.tsdb.wal-compression-type", "", "TSDB WAL type. Supported values are: 'snappy', 'zstd' and '' (disable compression)") f.IntVar(&cfg.WALSegmentSizeBytes, "blocks-storage.tsdb.wal-segment-size-bytes", wlog.DefaultSegmentSize, "TSDB WAL segments files max size (bytes).") f.BoolVar(&cfg.FlushBlocksOnShutdown, "blocks-storage.tsdb.flush-blocks-on-shutdown", false, "True to flush blocks to storage on shutdown. 
If false, incomplete blocks will be reused after restart.") @@ -206,6 +206,8 @@ func (cfg *TSDBConfig) RegisterFlags(f *flag.FlagSet) { f.Int64Var(&cfg.OutOfOrderCapMax, "blocks-storage.tsdb.out-of-order-cap-max", tsdb.DefaultOutOfOrderCapMax, "[EXPERIMENTAL] Configures the maximum number of samples per chunk that can be out-of-order.") f.BoolVar(&cfg.EnableNativeHistograms, "blocks-storage.tsdb.enable-native-histograms", false, "[EXPERIMENTAL] True to enable native histogram.") + flagext.DeprecatedFlag(f, "blocks-storage.tsdb.wal-compression-enabled", "Deprecated (use blocks-storage.tsdb.wal-compression-type instead): True to enable TSDB WAL compression.", util_log.Logger) + cfg.PostingsCache.RegisterFlagsWithPrefix("blocks-storage.", f) } From 47af5e5141b119c66d58cfbecd4dc6c078518ef1 Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Wed, 22 Jan 2025 17:06:01 -0800 Subject: [PATCH 19/34] Fix test (#6537) Signed-off-by: Daniel Deluiggi Signed-off-by: Alex Le --- pkg/ingester/ingester_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index f9d319e187e..3d386947fa5 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -6376,6 +6376,8 @@ func TestIngester_UpdateLabelSetMetrics(t *testing.T) { i, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, limits, tenantLimits, blocksDir, reg, false) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + // Wait until it's ACTIVE test.Poll(t, time.Second, ring.ACTIVE, func() interface{} { return i.lifecycler.GetState() From e3cc2977dd0f8c77a5b237268e28bc218bd95d7b Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Wed, 22 Jan 2025 12:30:43 -0800 Subject: [PATCH 20/34] Mark 1.19 release in progress https://github.com/cortexproject/cortex/blob/master/RELEASE.md#show-that-a-release-is-in-progress Signed-off-by: Charlie Le Signed-off-by: Alex Le --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6e57611f33..fbd2ef17896 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +## 1.19.0 in progress + * [CHANGE] Deprecate `-blocks-storage.tsdb.wal-compression-enabled` flag (use `blocks-storage.tsdb.wal-compression-type` instead). #6529 * [CHANGE] OTLP: Change OTLP handler to be consistent with the Prometheus OTLP handler. #6272 - `target_info` metric is enabled by default and can be disabled via `-distributor.otlp.disable-target-info=true` flag From e9584c02b1cb937267ba2f9b82c5e75dfa704d91 Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Wed, 22 Jan 2025 21:10:43 -0800 Subject: [PATCH 21/34] Prepare 1.19.0-rc.0 Signed-off-by: Charlie Le Signed-off-by: Alex Le --- CHANGELOG.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-- VERSION | 2 +- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fbd2ef17896..3f24da38962 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,15 @@ * [CHANGE] Enable Compactor and Alertmanager in target all. #6204 * [CHANGE] Update the `cortex_ingester_inflight_push_requests` metric to represent the maximum number of inflight requests recorded in the last minute. #6437 * [CHANGE] gRPC Client: Expose connection timeout and set default to value to 5s. 
#6523 +* [CHANGE] Enable analysis on Thanos PromQL engine #6472 +* [CHANGE] Log grafana headers along with query request before query execution #6391 +* [CHANGE] Log query requests in QFE before query execution #6390 +* [CHANGE] Log when a request starts running in querier #6525 +* [CHANGE] Remove openstack swfit experimental in blocks storage #6322 +* [CHANGE] Remove openstack swift experimental #6316 +* [CHANGE] Replace `cespare/xxhash` with `cespare/xxhash/v2` #6467 +* [CHANGE] Stop using global instant query codec #6328 +* [CHANGE] Unify query frontend instant and range protos #6180 * [FEATURE] Ruler: Add an experimental flag `-ruler.query-response-format` to retrieve query response as a proto format. #6345 * [FEATURE] Ruler: Pagination support for List Rules API. #6299 * [FEATURE] Query Frontend/Querier: Add protobuf codec `-api.querier-default-codec` and the option to choose response compression type `-querier.response-compression`. #5527 @@ -26,6 +35,13 @@ * [FEATURE] Query Frontend: Support a metadata federated query when `-tenant-federation.enabled=true`. #6461 * [FEATURE] Query Frontend: Support an exemplar federated query when `-tenant-federation.enabled=true`. #6455 * [FEATURE] Ingester/StoreGateway: Add support for cache regex query matchers via `-ingester.matchers-cache-max-items` and `-blocks-storage.bucket-store.matchers-cache-max-items`. #6477 #6491 +* [FEATURE] Add reason why the key was evicted in the `cortex_ingester_expanded_postings_cache_evicts` metric #6318 +* [FEATURE] Create feature flag to switch between current shuffle sharding group planner and partition compaction group planner #6141 +* [FEATURE] Hook up partition compaction end to end implementation #6510 +* [FEATURE] Implement partition compaction grouper #6172 +* [FEATURE] Implement partition compaction planner #6469 +* [FEATURE] Make LivenessCheck Timeout Configurable #6227 +* [FEATURE] Querier: Add day range limit for LabelNames and LabelValues #6233 * [ENHANCEMENT] Query Frontend: Add more operation label values to the `cortex_query_frontend_queries_total` metric. #6519 * [ENHANCEMENT] Query Frontend: Add a `source` label to query stat metrics. #6470 * [ENHANCEMENT] Query Frontend: Add a flag `-tenant-federation.max-tenant` to limit the number of tenants for federated query. #6493 @@ -66,14 +82,50 @@ * [ENHANCEMENT] Distributor: Added `cortex_distributor_received_samples_per_labelset_total` metric to calculate ingestion rate per label set. #6443 * [ENHANCEMENT] Added metric name in limiter per-metric exceeded errors. #6416 * [ENHANCEMENT] StoreGateway: Added `cortex_bucket_store_indexheader_load_duration_seconds` and `cortex_bucket_store_indexheader_download_duration_seconds` metrics for time of downloading and loading index header files. #6445 +* [ENHANCEMENT] Add cleaner logic to clean partition compaction blocks and related files #6507 +* [ENHANCEMENT] Add support for native histograms in querier protobuf codec #6368 +* [ENHANCEMENT] Add timeout on lifecycler heartbeat #6212 +* [ENHANCEMENT] Added UserReplicaGroupMetrics #6463 * [ENHANCEMENT] Blocks Storage: Allow use of non-dualstack endpoints for S3 blocks storage via `-blocks-storage.s3.disable-dualstack`. #6522 +* [ENHANCEMENT] Discarded samples per labelset metrics for throttle by labelset #6492 +* [ENHANCEMENT] Expanded Postings Cache can cache results without the nearly created series under high load. #6417 +* [ENHANCEMENT] Ingester: Disable chunk trimming. 
#6270 +* [ENHANCEMENT] Improve consistency check warn log #6366 +* [ENHANCEMENT] Improve streaming on MetricsForLabelMatchersStream method #6436 +* [ENHANCEMENT] Improve validation metrics for discarded samples and exemplars #6218 +* [ENHANCEMENT] Query Frontend: add new field for dense native histogram format #6199 +* [ENHANCEMENT] Return 503 on hitting distributor instance limits #6387 +* [ENHANCEMENT] Reusing Batch Iterators #6403 +* [ENHANCEMENT] Reusing the grpc client to peform healthcheck #6260 +* [ENHANCEMENT] Store Gateway: Add pre add block ownership check #6483 +* [ENHANCEMENT] Use slice pooling to populate the query stream response #6466 +* [ENHANCEMENT] Using a single seed array for expanded postings cache on ingesters #6365 * [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224 * [BUGFIX] Ruler: Allow rule evaluation to complete during shutdown. #6326 -* [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled. #6271 +* [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled. #6271 * [BUGFIX] Ingester: Fix regression on usage of cortex_ingester_queried_chunks. #6398 * [BUGFIX] Ingester: Fix possible race condition when `active series per LabelSet` is configured. #6409 * [BUGFIX] Query Frontend: Fix @ modifier not being applied correctly on sub queries. #6450 * [BUGFIX] Cortex Redis flags with multiple dots #6476 +* [BUGFIX] Bug fix on JSON Tag #6339 +* [BUGFIX] Calculate # of concurrency only once at the runner #6506 +* [BUGFIX] Clean up ingester per labelset metrics #6439 +* [BUGFIX] Cleanup dangling request queue metrics #6433 +* [BUGFIX] Fix BenchmarkDistributor_Push benchmark #6309 +* [BUGFIX] Fix data race on expanded postings Cache #6369 +* [BUGFIX] Fix lazy postings merge bug #6415 +* [BUGFIX] Fix race on chunks multilevel cache + Optimize to avoid refetching already found keys. #6312 +* [BUGFIX] Fix race on the string interning #6408 +* [BUGFIX] Fix race that can cause nil reference when using expanded postings #6518 +* [BUGFIX] Fix regression of query range result cache unable to parse old cached results #6196 +* [BUGFIX] Fix typo in usage message for querier.split-queries-by-interval flag #6305 +* [BUGFIX] Fix: PostingCache promise should fetch data only once #6314 +* [BUGFIX] Fix: fix slice init length #6237 +* [BUGFIX] Fixed bug that blocks cannot be fully deleted from TSDB #6231 +* [BUGFIX] Fixed ingester ReadOnly state related bugs #6208 +* [BUGFIX] Preserve ingester state on restart #6301 +* [BUGFIX] Purge expired postings cache items due inactivity #6502 +* [BUGFIX] Util: Check context every N iterations #6250 ## 1.18.1 2024-10-14 @@ -1173,7 +1225,7 @@ Note the blocks storage compactor runs a migration task at startup in this versi * [ENHANCEMENT] Improve performance of QueryStream() in ingesters. #3177 * [ENHANCEMENT] Modules included in "All" target are now visible in output of `-modules` CLI flag. #3155 * [ENHANCEMENT] Added `/debug/fgprof` endpoint to debug running Cortex process using `fgprof`. This adds up to the existing `/debug/...` endpoints. #3131 -* [ENHANCEMENT] Blocks storage: optimised `/api/v1/series` for blocks storage. (#2976) +* [ENHANCEMENT] Blocks storage: optimised `/api/v1/series` for blocks storage. #2976 * [BUGFIX] Ruler: when loading rules from "local" storage, check for directory after resolving symlink. 
#3137 * [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990 * [BUGFIX] Querier: Merge results from chunks and blocks ingesters when using streaming of results. #3013 diff --git a/VERSION b/VERSION index ec6d649be65..d3de7a2e908 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.18.1 +1.19.0-rc.0 From 83ddfb85570cb3df6e9121c1b0313f63dd35ffee Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Thu, 23 Jan 2025 17:34:26 -0800 Subject: [PATCH 22/34] Revert "Prepare 1.19.0-rc.0" Signed-off-by: Alex Le --- CHANGELOG.md | 56 ++-------------------------------------------------- VERSION | 2 +- 2 files changed, 3 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f24da38962..fbd2ef17896 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,15 +14,6 @@ * [CHANGE] Enable Compactor and Alertmanager in target all. #6204 * [CHANGE] Update the `cortex_ingester_inflight_push_requests` metric to represent the maximum number of inflight requests recorded in the last minute. #6437 * [CHANGE] gRPC Client: Expose connection timeout and set default to value to 5s. #6523 -* [CHANGE] Enable analysis on Thanos PromQL engine #6472 -* [CHANGE] Log grafana headers along with query request before query execution #6391 -* [CHANGE] Log query requests in QFE before query execution #6390 -* [CHANGE] Log when a request starts running in querier #6525 -* [CHANGE] Remove openstack swfit experimental in blocks storage #6322 -* [CHANGE] Remove openstack swift experimental #6316 -* [CHANGE] Replace `cespare/xxhash` with `cespare/xxhash/v2` #6467 -* [CHANGE] Stop using global instant query codec #6328 -* [CHANGE] Unify query frontend instant and range protos #6180 * [FEATURE] Ruler: Add an experimental flag `-ruler.query-response-format` to retrieve query response as a proto format. #6345 * [FEATURE] Ruler: Pagination support for List Rules API. #6299 * [FEATURE] Query Frontend/Querier: Add protobuf codec `-api.querier-default-codec` and the option to choose response compression type `-querier.response-compression`. #5527 @@ -35,13 +26,6 @@ * [FEATURE] Query Frontend: Support a metadata federated query when `-tenant-federation.enabled=true`. #6461 * [FEATURE] Query Frontend: Support an exemplar federated query when `-tenant-federation.enabled=true`. #6455 * [FEATURE] Ingester/StoreGateway: Add support for cache regex query matchers via `-ingester.matchers-cache-max-items` and `-blocks-storage.bucket-store.matchers-cache-max-items`. #6477 #6491 -* [FEATURE] Add reason why the key was evicted in the `cortex_ingester_expanded_postings_cache_evicts` metric #6318 -* [FEATURE] Create feature flag to switch between current shuffle sharding group planner and partition compaction group planner #6141 -* [FEATURE] Hook up partition compaction end to end implementation #6510 -* [FEATURE] Implement partition compaction grouper #6172 -* [FEATURE] Implement partition compaction planner #6469 -* [FEATURE] Make LivenessCheck Timeout Configurable #6227 -* [FEATURE] Querier: Add day range limit for LabelNames and LabelValues #6233 * [ENHANCEMENT] Query Frontend: Add more operation label values to the `cortex_query_frontend_queries_total` metric. #6519 * [ENHANCEMENT] Query Frontend: Add a `source` label to query stat metrics. #6470 * [ENHANCEMENT] Query Frontend: Add a flag `-tenant-federation.max-tenant` to limit the number of tenants for federated query. 
#6493 @@ -82,50 +66,14 @@ * [ENHANCEMENT] Distributor: Added `cortex_distributor_received_samples_per_labelset_total` metric to calculate ingestion rate per label set. #6443 * [ENHANCEMENT] Added metric name in limiter per-metric exceeded errors. #6416 * [ENHANCEMENT] StoreGateway: Added `cortex_bucket_store_indexheader_load_duration_seconds` and `cortex_bucket_store_indexheader_download_duration_seconds` metrics for time of downloading and loading index header files. #6445 -* [ENHANCEMENT] Add cleaner logic to clean partition compaction blocks and related files #6507 -* [ENHANCEMENT] Add support for native histograms in querier protobuf codec #6368 -* [ENHANCEMENT] Add timeout on lifecycler heartbeat #6212 -* [ENHANCEMENT] Added UserReplicaGroupMetrics #6463 * [ENHANCEMENT] Blocks Storage: Allow use of non-dualstack endpoints for S3 blocks storage via `-blocks-storage.s3.disable-dualstack`. #6522 -* [ENHANCEMENT] Discarded samples per labelset metrics for throttle by labelset #6492 -* [ENHANCEMENT] Expanded Postings Cache can cache results without the nearly created series under high load. #6417 -* [ENHANCEMENT] Ingester: Disable chunk trimming. #6270 -* [ENHANCEMENT] Improve consistency check warn log #6366 -* [ENHANCEMENT] Improve streaming on MetricsForLabelMatchersStream method #6436 -* [ENHANCEMENT] Improve validation metrics for discarded samples and exemplars #6218 -* [ENHANCEMENT] Query Frontend: add new field for dense native histogram format #6199 -* [ENHANCEMENT] Return 503 on hitting distributor instance limits #6387 -* [ENHANCEMENT] Reusing Batch Iterators #6403 -* [ENHANCEMENT] Reusing the grpc client to peform healthcheck #6260 -* [ENHANCEMENT] Store Gateway: Add pre add block ownership check #6483 -* [ENHANCEMENT] Use slice pooling to populate the query stream response #6466 -* [ENHANCEMENT] Using a single seed array for expanded postings cache on ingesters #6365 * [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224 * [BUGFIX] Ruler: Allow rule evaluation to complete during shutdown. #6326 -* [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled. #6271 +* [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled. #6271 * [BUGFIX] Ingester: Fix regression on usage of cortex_ingester_queried_chunks. #6398 * [BUGFIX] Ingester: Fix possible race condition when `active series per LabelSet` is configured. #6409 * [BUGFIX] Query Frontend: Fix @ modifier not being applied correctly on sub queries. #6450 * [BUGFIX] Cortex Redis flags with multiple dots #6476 -* [BUGFIX] Bug fix on JSON Tag #6339 -* [BUGFIX] Calculate # of concurrency only once at the runner #6506 -* [BUGFIX] Clean up ingester per labelset metrics #6439 -* [BUGFIX] Cleanup dangling request queue metrics #6433 -* [BUGFIX] Fix BenchmarkDistributor_Push benchmark #6309 -* [BUGFIX] Fix data race on expanded postings Cache #6369 -* [BUGFIX] Fix lazy postings merge bug #6415 -* [BUGFIX] Fix race on chunks multilevel cache + Optimize to avoid refetching already found keys. 
#6312 -* [BUGFIX] Fix race on the string interning #6408 -* [BUGFIX] Fix race that can cause nil reference when using expanded postings #6518 -* [BUGFIX] Fix regression of query range result cache unable to parse old cached results #6196 -* [BUGFIX] Fix typo in usage message for querier.split-queries-by-interval flag #6305 -* [BUGFIX] Fix: PostingCache promise should fetch data only once #6314 -* [BUGFIX] Fix: fix slice init length #6237 -* [BUGFIX] Fixed bug that blocks cannot be fully deleted from TSDB #6231 -* [BUGFIX] Fixed ingester ReadOnly state related bugs #6208 -* [BUGFIX] Preserve ingester state on restart #6301 -* [BUGFIX] Purge expired postings cache items due inactivity #6502 -* [BUGFIX] Util: Check context every N iterations #6250 ## 1.18.1 2024-10-14 @@ -1225,7 +1173,7 @@ Note the blocks storage compactor runs a migration task at startup in this versi * [ENHANCEMENT] Improve performance of QueryStream() in ingesters. #3177 * [ENHANCEMENT] Modules included in "All" target are now visible in output of `-modules` CLI flag. #3155 * [ENHANCEMENT] Added `/debug/fgprof` endpoint to debug running Cortex process using `fgprof`. This adds up to the existing `/debug/...` endpoints. #3131 -* [ENHANCEMENT] Blocks storage: optimised `/api/v1/series` for blocks storage. #2976 +* [ENHANCEMENT] Blocks storage: optimised `/api/v1/series` for blocks storage. (#2976) * [BUGFIX] Ruler: when loading rules from "local" storage, check for directory after resolving symlink. #3137 * [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990 * [BUGFIX] Querier: Merge results from chunks and blocks ingesters when using streaming of results. #3013 diff --git a/VERSION b/VERSION index d3de7a2e908..ec6d649be65 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.19.0-rc.0 +1.18.1 From a9d69a65bf0e288e2eb851ab5aba5dacd4905017 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Fri, 24 Jan 2025 15:13:31 -0800 Subject: [PATCH 23/34] Fixed blocksGroupWithPartition unable to reuse functions from blocksGroup (#6547) * Fixed blocksGroupWithPartition unable to reuse functions from blocksGroup Signed-off-by: Alex Le * update tests Signed-off-by: Alex Le --------- Signed-off-by: Alex Le --- pkg/compactor/partition_compaction_grouper.go | 11 +- .../partition_compaction_grouper_test.go | 132 +++++++++++++++++- 2 files changed, 132 insertions(+), 11 deletions(-) diff --git a/pkg/compactor/partition_compaction_grouper.go b/pkg/compactor/partition_compaction_grouper.go index 1340093ab28..53f7762df87 100644 --- a/pkg/compactor/partition_compaction_grouper.go +++ b/pkg/compactor/partition_compaction_grouper.go @@ -499,9 +499,11 @@ func (g *PartitionCompactionGrouper) partitionBlocksGroup(partitionCount int, bl addToPartitionedGroups := func(blocks []*metadata.Meta, partitionID int) { if _, ok := partitionedGroups[partitionID]; !ok { partitionedGroups[partitionID] = blocksGroupWithPartition{ - rangeStart: rangeStart, - rangeEnd: rangeEnd, - blocks: []*metadata.Meta{}, + blocksGroup: blocksGroup{ + rangeStart: rangeStart, + rangeEnd: rangeEnd, + blocks: []*metadata.Meta{}, + }, } } partitionedGroup := partitionedGroups[partitionID] @@ -868,9 +870,6 @@ func (t *timeRangeStatus) previousTimeRangeDuration() time.Duration { type blocksGroupWithPartition struct { blocksGroup - rangeStart int64 // Included. - rangeEnd int64 // Excluded. 
- blocks []*metadata.Meta groupHash uint32 partitionedGroupInfo *PartitionedGroupInfo partition Partition diff --git a/pkg/compactor/partition_compaction_grouper_test.go b/pkg/compactor/partition_compaction_grouper_test.go index 2167a219ae1..259981c33cd 100644 --- a/pkg/compactor/partition_compaction_grouper_test.go +++ b/pkg/compactor/partition_compaction_grouper_test.go @@ -84,6 +84,64 @@ func TestPartitionCompactionGrouper_GenerateCompactionJobs(t *testing.T) { {blocks: []ulid.ULID{block3, block4}, partitionCount: 1, partitionID: 0, rangeStart: 2 * H, rangeEnd: 4 * H}, }, }, + "only level 1 blocks with ingestion replication factor 3": { + ranges: []time.Duration{2 * time.Hour, 12 * time.Hour, 24 * time.Hour}, + blocks: map[ulid.ULID]mockBlock{ + block1: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block1, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block2: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block2, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block3: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block3, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block4: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block4, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block5: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block5, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block6: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block6, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 1}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + }, + existingPartitionedGroups: []mockExistingPartitionedGroup{}, + expected: []expectedCompactionJob{ + {blocks: []ulid.ULID{block1, block2, block3, block4, block5, block6}, partitionCount: 1, partitionID: 0, rangeStart: 0 * H, rangeEnd: 2 * H}, + }, + ingestionReplicationFactor: 3, + }, "only level 1 blocks, there is existing partitioned group file": { ranges: []time.Duration{2 * time.Hour, 12 * time.Hour, 24 * time.Hour}, blocks: map[ulid.ULID]mockBlock{ @@ -499,6 +557,65 @@ func TestPartitionCompactionGrouper_GenerateCompactionJobs(t *testing.T) { {blocks: []ulid.ULID{block1, block2, block3}, 
partitionCount: 1, partitionID: 0, rangeStart: 0 * H, rangeEnd: 12 * H}, }, }, + "level 2 blocks with ingestion replication factor 3": { + ranges: []time.Duration{2 * time.Hour, 12 * time.Hour, 24 * time.Hour}, + blocks: map[ulid.ULID]mockBlock{ + block1: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block1, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 0}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block2: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block2, MinTime: 0 * H, MaxTime: 2 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 1}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block3: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block3, MinTime: 2 * H, MaxTime: 4 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 0}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block4: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block4, MinTime: 2 * H, MaxTime: 4 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 1}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block5: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block5, MinTime: 4 * H, MaxTime: 6 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 0}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + block6: { + meta: &metadata.Meta{ + BlockMeta: tsdb.BlockMeta{ULID: block6, MinTime: 4 * H, MaxTime: 6 * H, Compaction: tsdb.BlockMetaCompaction{Level: 2}, Stats: tsdb.BlockStats{NumSeries: 1}}, + Thanos: metadata.Thanos{Extensions: cortextsdb.CortexMetaExtensions{PartitionInfo: &cortextsdb.PartitionInfo{PartitionCount: 2, PartitionID: 1}}, Files: []metadata.File{{RelPath: thanosblock.IndexFilename, SizeBytes: 0}}}, + }, + timeRange: 2 * time.Hour, + hasNoCompactMark: false, + }, + }, + existingPartitionedGroups: []mockExistingPartitionedGroup{}, + expected: []expectedCompactionJob{ + {blocks: []ulid.ULID{block1, block3, block5}, partitionCount: 2, partitionID: 0, rangeStart: 0 * H, rangeEnd: 12 * H}, + {blocks: []ulid.ULID{block2, block4, block6}, partitionCount: 2, partitionID: 1, rangeStart: 0 * H, rangeEnd: 12 * H}, + }, + ingestionReplicationFactor: 3, + 
}, "level 2 blocks along with level 3 blocks from some of partitions, level 1 blocks in different time range, there are partitioned group files for all groups": { ranges: []time.Duration{2 * time.Hour, 12 * time.Hour, 24 * time.Hour}, blocks: map[ulid.ULID]mockBlock{ @@ -1966,6 +2083,10 @@ func TestPartitionCompactionGrouper_GenerateCompactionJobs(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + ingestionReplicationFactor := 1 + if testCase.ingestionReplicationFactor > 1 { + ingestionReplicationFactor = testCase.ingestionReplicationFactor + } g := NewPartitionCompactionGrouper( ctx, nil, @@ -1988,7 +2109,7 @@ func TestPartitionCompactionGrouper_GenerateCompactionJobs(t *testing.T) { false, visitMarkerTimeout, noCompactFilter, - 1, + ingestionReplicationFactor, ) actual, err := g.generateCompactionJobs(testCase.getBlocks()) require.NoError(t, err) @@ -2011,10 +2132,11 @@ func TestPartitionCompactionGrouper_GenerateCompactionJobs(t *testing.T) { } type generateCompactionJobsTestCase struct { - ranges []time.Duration - blocks map[ulid.ULID]mockBlock - existingPartitionedGroups []mockExistingPartitionedGroup - expected []expectedCompactionJob + ranges []time.Duration + blocks map[ulid.ULID]mockBlock + existingPartitionedGroups []mockExistingPartitionedGroup + expected []expectedCompactionJob + ingestionReplicationFactor int } func (g *generateCompactionJobsTestCase) setupBucketStore(t *testing.T, bkt *bucket.ClientMock, userID string, visitMarkerTimeout time.Duration) { From 1035fa17bdd2f5f951a5d0594113f0878c0fab2b Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Sat, 25 Jan 2025 08:54:12 +0900 Subject: [PATCH 24/34] Remove TransferChunks gRPC method (#6543) Signed-off-by: SungJin1212 Signed-off-by: Alex Le --- pkg/cortex/cortex.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index ee62bf7ee53..4623ea2c48a 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -344,13 +344,10 @@ func New(cfg Config) (*Cortex, error) { tenant.WithDefaultResolver(tenant.NewMultiResolver()) } - // Don't check auth header on TransferChunks, as we weren't originally - // sending it and this could cause transfers to fail on update. cfg.API.HTTPAuthMiddleware = fakeauth.SetupAuthMiddleware(&cfg.Server, cfg.AuthEnabled, // Also don't check auth for these gRPC methods, since single call is used for multiple users (or no user like health check). 
[]string{ "/grpc.health.v1.Health/Check", - "/cortex.Ingester/TransferChunks", "/frontend.Frontend/Process", "/frontend.Frontend/NotifyClientShutdown", "/schedulerpb.SchedulerForFrontend/FrontendLoop", From cdc6781c6a770352c93d957057f46df4400803f6 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Mon, 27 Jan 2025 19:28:58 -0800 Subject: [PATCH 25/34] Uupdate Ppromqlsmith (#6557) Signed-off-by: alanprot Signed-off-by: Alex Le --- go.mod | 2 +- go.sum | 4 +- .../cortexproject/promqlsmith/walk.go | 53 +++++++++++++++---- vendor/modules.txt | 2 +- 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index be6de1356eb..a3fe4fdf8bb 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/armon/go-metrics v0.4.1 github.com/aws/aws-sdk-go v1.55.5 github.com/bradfitz/gomemcache v0.0.0-20230905024940-24af94b03874 - github.com/cortexproject/promqlsmith v0.0.0-20241121054008-8b48fe2471ef + github.com/cortexproject/promqlsmith v0.0.0-20250128002239-eaf3e57157fc github.com/dustin/go-humanize v1.0.1 github.com/efficientgo/core v1.0.0-rc.3 github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb diff --git a/go.sum b/go.sum index 64dfdce3ba3..549b4e367c1 100644 --- a/go.sum +++ b/go.sum @@ -934,8 +934,8 @@ github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cortexproject/promqlsmith v0.0.0-20241121054008-8b48fe2471ef h1:wR21ZiKkA+wN2KG43qrK33IkFduY9JUa6th6P2KEU0o= -github.com/cortexproject/promqlsmith v0.0.0-20241121054008-8b48fe2471ef/go.mod h1:xbYQa0KX6Eh6YWbTBfZ9kK3N4hRxX+ZPIfVIY2U/y00= +github.com/cortexproject/promqlsmith v0.0.0-20250128002239-eaf3e57157fc h1:paM+YXwcIaz108+bFrm5aCepwVSxBfq48gb3Zc1z/nQ= +github.com/cortexproject/promqlsmith v0.0.0-20250128002239-eaf3e57157fc/go.mod h1:xbYQa0KX6Eh6YWbTBfZ9kK3N4hRxX+ZPIfVIY2U/y00= github.com/cortexproject/weaveworks-common v0.0.0-20241129212437-96019edf21f1 h1:UoSixdl0sBUhfEOMpIGxFnJjp3/y/+nkw6Du7su05FE= github.com/cortexproject/weaveworks-common v0.0.0-20241129212437-96019edf21f1/go.mod h1:7cl8fS/nivXe2DmBUUmr/3UGTJG2jVU2NRaIayR2Zjs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= diff --git a/vendor/github.com/cortexproject/promqlsmith/walk.go b/vendor/github.com/cortexproject/promqlsmith/walk.go index 24268412d53..06d55067b14 100644 --- a/vendor/github.com/cortexproject/promqlsmith/walk.go +++ b/vendor/github.com/cortexproject/promqlsmith/walk.go @@ -2,6 +2,7 @@ package promqlsmith import ( "fmt" + "math" "math/rand" "sort" "strings" @@ -452,7 +453,7 @@ func (s *PromQLSmith) walkLabelMatchers() []*labels.Matcher { } series := s.seriesSet[s.rnd.Intn(len(s.seriesSet))] orders := s.rnd.Perm(series.Len()) - items := s.rnd.Intn((series.Len() + 1) / 2) + items := s.rnd.Intn(int(math.Ceil(float64(series.Len()+1) / 2))) matchers := make([]*labels.Matcher, 0, items) containsName := false lbls := make([]labels.Label, 0, series.Len()) @@ -460,12 +461,49 @@ func (s *PromQLSmith) walkLabelMatchers() []*labels.Matcher { lbls = append(lbls, l) }) + valF := func(v string) string { + val := s.rnd.Float64() + switch { + case val > 0.95: + return "" + case 
val > 0.90: + return ".*" + case val > 0.85: + return ".+" + case val > 0.75: + return fmt.Sprintf(".*%v", v[len(v)/2:]) + default: + return fmt.Sprintf("%v.*", v[:len(v)/2]) + } + } + for i := 0; i < items; i++ { + var matcher *labels.Matcher + if lbls[orders[i]].Name == labels.MetricName { containsName = true + matcher = labels.MustNewMatcher(labels.MatchEqual, lbls[orders[i]].Name, lbls[orders[i]].Value) + } else { + res := s.rnd.Intn(4) + matchType := labels.MatchType(res) + switch matchType { + case labels.MatchEqual: + matcher = labels.MustNewMatcher(labels.MatchEqual, lbls[orders[i]].Name, lbls[orders[i]].Value) + case labels.MatchNotEqual: + val := lbls[orders[i]].Value + if s.rnd.Float64() > 0.9 { + val = "" + } + matcher = labels.MustNewMatcher(labels.MatchNotEqual, lbls[orders[i]].Name, val) + case labels.MatchRegexp: + matcher = labels.MustNewMatcher(labels.MatchRegexp, lbls[orders[i]].Name, valF(lbls[orders[i]].Value)) + case labels.MatchNotRegexp: + matcher = labels.MustNewMatcher(labels.MatchNotRegexp, lbls[orders[i]].Name, valF(lbls[orders[i]].Value)) + } } - matchers = append(matchers, labels.MustNewMatcher(labels.MatchEqual, lbls[orders[i]].Name, lbls[orders[i]].Value)) + + matchers = append(matchers, matcher) } if !containsName { @@ -482,8 +520,8 @@ func (s *PromQLSmith) walkLabelMatchers() []*labels.Matcher { return matchers } -// walkSelectors is similar to walkLabelMatchers, but used for generating various -// types of matchers more than simple equal matcher. +// walkSelectors is similar to walkLabelMatchers, but does not guarantee the equal +// matcher on the metric name func (s *PromQLSmith) walkSelectors() []*labels.Matcher { if len(s.seriesSet) == 0 { return nil @@ -687,13 +725,6 @@ func keepValueTypes(input []parser.ValueType, keep []parser.ValueType) []parser. return out } -func min(a, b int) int { - if a > b { - return b - } - return a -} - // generate a non-zero float64 value randomly. 
func getNonZeroFloat64(rnd *rand.Rand) float64 { for { diff --git a/vendor/modules.txt b/vendor/modules.txt index e9f5b61da7d..aa96a865c9e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -303,7 +303,7 @@ github.com/coreos/go-semver/semver ## explicit; go 1.12 github.com/coreos/go-systemd/v22/activation github.com/coreos/go-systemd/v22/journal -# github.com/cortexproject/promqlsmith v0.0.0-20241121054008-8b48fe2471ef +# github.com/cortexproject/promqlsmith v0.0.0-20250128002239-eaf3e57157fc ## explicit; go 1.22.0 github.com/cortexproject/promqlsmith # github.com/cristalhq/hedgedhttp v0.9.1 From 550a559cc48bcaa37296d502c61a80f93877b2c3 Mon Sep 17 00:00:00 2001 From: Justin Jung Date: Tue, 28 Jan 2025 12:31:35 -0800 Subject: [PATCH 26/34] Query Partial Data (#6526) * Create partial_data Signed-off-by: Justin Jung * Fix lazyquery so that warning message is returned Signed-off-by: Justin Jung * Add QueryPartialData limit Signed-off-by: Justin Jung * Fix broken mock Signed-off-by: Justin Jung * Make response with warnings to be not cached Signed-off-by: Justin Jung * Updated streamingSelect in distributor_queryable Signed-off-by: Justin Jung * Update query.go Signed-off-by: Justin Jung * Update replication_set Signed-off-by: Justin Jung * Lint Signed-off-by: Justin Jung * Lint again Signed-off-by: Justin Jung * Generated doc Signed-off-by: Justin Jung * Changelog Signed-off-by: Justin Jung * Update config description Signed-off-by: Justin Jung * Do not remove warnings from seriesSet Signed-off-by: Justin Jung * Avoid cache only if the warning message contains partial data error Signed-off-by: Justin Jung * Remove context usage for partial data Signed-off-by: Justin Jung * Refactor how partial data info is passed + apply to series and label methods as well Signed-off-by: Justin Jung * Lint + fix tests Signed-off-by: Justin Jung * Fix build Signed-off-by: Justin Jung * Create separate config for ruler partial data Signed-off-by: Justin Jung * Generate doc Signed-off-by: Justin Jung * Add more tests Signed-off-by: Justin Jung * Change error Signed-off-by: Justin Jung * Fix test Signed-off-by: Justin Jung * Update changelog Signed-off-by: Justin Jung * Update changelog Signed-off-by: Justin Jung * Nit Signed-off-by: Justin Jung * Nit Signed-off-by: Justin Jung --------- Signed-off-by: Justin Jung Signed-off-by: Alex Le --- CHANGELOG.md | 2 + docs/configuration/config-file-reference.md | 8 + pkg/cortex/modules.go | 4 +- pkg/distributor/distributor.go | 32 ++-- pkg/distributor/distributor_test.go | 34 ++-- pkg/distributor/query.go | 18 +- pkg/querier/distributor_queryable.go | 102 +++++++++--- pkg/querier/distributor_queryable_test.go | 156 ++++++++++-------- pkg/querier/lazyquery/lazyquery.go | 5 +- pkg/querier/partialdata/partia_data.go | 13 ++ pkg/querier/partialdata/partial_data_test.go | 13 ++ pkg/querier/querier.go | 5 +- pkg/querier/querier_test.go | 58 +++---- pkg/querier/testutils.go | 16 +- .../tripperware/queryrange/results_cache.go | 5 + .../queryrange/results_cache_test.go | 29 ++++ pkg/ring/replication_set.go | 21 ++- pkg/ring/replication_set_test.go | 22 ++- pkg/ring/replication_set_tracker.go | 14 ++ pkg/ring/replication_set_tracker_test.go | 23 +++ pkg/ruler/ruler_test.go | 2 +- pkg/util/validation/limits.go | 12 ++ pkg/util/validation/limits_test.go | 34 ++++ 23 files changed, 453 insertions(+), 175 deletions(-) create mode 100644 pkg/querier/partialdata/partia_data.go create mode 100644 pkg/querier/partialdata/partial_data_test.go diff
--git a/CHANGELOG.md b/CHANGELOG.md index fbd2ef17896..771bb0554a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 + ## 1.19.0 in progress * [CHANGE] Deprecate `-blocks-storage.tsdb.wal-compression-enabled` flag (use `blocks-storage.tsdb.wal-compression-type` instead). #6529 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index f0a69fee853..43948773a55 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3545,6 +3545,10 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s # CLI flag: -frontend.max-queriers-per-tenant [max_queriers_per_tenant: | default = 0] +# Enable to allow queries to be evaluated with data from a single zone, if other +# zones are not available. +[query_partial_data: | default = false] + # Maximum number of outstanding requests per tenant per request queue (either # query frontend or query scheduler); requests beyond this error with HTTP 429. # CLI flag: -frontend.max-outstanding-requests-per-tenant @@ -3605,6 +3609,10 @@ query_rejection: # external labels for alerting rules [ruler_external_labels: | default = []] +# Enable to allow rules to be evaluated with data from a single zone, if other +# zones are not available. +[rules_partial_data: | default = false] + # The default tenant's shard size when the shuffle-sharding strategy is used. # Must be set when the store-gateway sharding is enabled with the # shuffle-sharding strategy. When this setting is specified in the per-tenant diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 390212a313d..1bea30b7bd8 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -258,7 +258,7 @@ func (t *Cortex) initQueryable() (serv services.Service, err error) { querierRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "querier"}, prometheus.DefaultRegisterer) // Create a querier queryable and PromQL engine - t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger) + t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger, t.Overrides.QueryPartialData) // Use distributor as default MetadataQuerier t.MetadataQuerier = t.Distributor @@ -623,7 +623,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) { } else { rulerRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "ruler"}, prometheus.DefaultRegisterer) // TODO: Consider wrapping logger to differentiate from querier module logger - queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger) + queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger, t.Overrides.RulesPartialData) managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, metrics, prometheus.DefaultRegisterer) manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, t.Overrides, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger) diff 
--git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index bdeeabebeb4..9aea6d8df0b 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1177,8 +1177,8 @@ func getErrorStatus(err error) string { } // ForReplicationSet runs f, in parallel, for all ingesters in the input replication set. -func (d *Distributor) ForReplicationSet(ctx context.Context, replicationSet ring.ReplicationSet, zoneResultsQuorum bool, f func(context.Context, ingester_client.IngesterClient) (interface{}, error)) ([]interface{}, error) { - return replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, zoneResultsQuorum, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { +func (d *Distributor) ForReplicationSet(ctx context.Context, replicationSet ring.ReplicationSet, zoneResultsQuorum bool, partialDataEnabled bool, f func(context.Context, ingester_client.IngesterClient) (interface{}, error)) ([]interface{}, error) { + return replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, zoneResultsQuorum, partialDataEnabled, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { client, err := d.ingesterPool.GetClientFor(ing.Addr) if err != nil { return nil, err @@ -1228,9 +1228,9 @@ func (d *Distributor) LabelValuesForLabelNameCommon(ctx context.Context, from, t } // LabelValuesForLabelName returns all the label values that are associated with a given label name. -func (d *Distributor) LabelValuesForLabelName(ctx context.Context, from, to model.Time, labelName model.LabelName, hint *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (d *Distributor) LabelValuesForLabelName(ctx context.Context, from, to model.Time, labelName model.LabelName, hint *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { return d.LabelValuesForLabelNameCommon(ctx, from, to, labelName, hint, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.LabelValuesRequest) ([]interface{}, error) { - return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { resp, err := client.LabelValues(ctx, req) if err != nil { return nil, err @@ -1241,9 +1241,9 @@ func (d *Distributor) LabelValuesForLabelName(ctx context.Context, from, to mode } // LabelValuesForLabelNameStream returns all the label values that are associated with a given label name. 
-func (d *Distributor) LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, labelName model.LabelName, hint *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (d *Distributor) LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, labelName model.LabelName, hint *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { return d.LabelValuesForLabelNameCommon(ctx, from, to, labelName, hint, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.LabelValuesRequest) ([]interface{}, error) { - return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { stream, err := client.LabelValuesStream(ctx, req) if err != nil { return nil, err @@ -1307,9 +1307,9 @@ func (d *Distributor) LabelNamesCommon(ctx context.Context, from, to model.Time, return r, nil } -func (d *Distributor) LabelNamesStream(ctx context.Context, from, to model.Time, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (d *Distributor) LabelNamesStream(ctx context.Context, from, to model.Time, hints *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { return d.LabelNamesCommon(ctx, from, to, hints, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.LabelNamesRequest) ([]interface{}, error) { - return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { stream, err := client.LabelNamesStream(ctx, req) if err != nil { return nil, err @@ -1333,9 +1333,9 @@ func (d *Distributor) LabelNamesStream(ctx context.Context, from, to model.Time, } // LabelNames returns all the label names. 
-func (d *Distributor) LabelNames(ctx context.Context, from, to model.Time, hint *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (d *Distributor) LabelNames(ctx context.Context, from, to model.Time, hint *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { return d.LabelNamesCommon(ctx, from, to, hint, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.LabelNamesRequest) ([]interface{}, error) { - return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + return d.ForReplicationSet(ctx, rs, d.cfg.ZoneResultsQuorumMetadata, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { resp, err := client.LabelNames(ctx, req) if err != nil { return nil, err @@ -1346,9 +1346,9 @@ func (d *Distributor) LabelNames(ctx context.Context, from, to model.Time, hint } // MetricsForLabelMatchers gets the metrics that match said matchers -func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return d.metricsForLabelMatchersCommon(ctx, from, through, hint, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.MetricsForLabelMatchersRequest, metrics *map[model.Fingerprint]model.Metric, mutex *sync.Mutex, queryLimiter *limiter.QueryLimiter) error { - _, err := d.ForReplicationSet(ctx, rs, false, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + _, err := d.ForReplicationSet(ctx, rs, false, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { resp, err := client.MetricsForLabelMatchers(ctx, req) if err != nil { return nil, err @@ -1375,9 +1375,9 @@ func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through }, matchers...) } -func (d *Distributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (d *Distributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return d.metricsForLabelMatchersCommon(ctx, from, through, hint, func(ctx context.Context, rs ring.ReplicationSet, req *ingester_client.MetricsForLabelMatchersRequest, metrics *map[model.Fingerprint]model.Metric, mutex *sync.Mutex, queryLimiter *limiter.QueryLimiter) error { - _, err := d.ForReplicationSet(ctx, rs, false, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + _, err := d.ForReplicationSet(ctx, rs, false, partialDataEnabled, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { stream, err := client.MetricsForLabelMatchersStream(ctx, req) if err != nil { return nil, err @@ -1453,7 +1453,7 @@ func (d *Distributor) MetricsMetadata(ctx context.Context) ([]scrape.MetricMetad req := &ingester_client.MetricsMetadataRequest{} // TODO(gotjosh): We only need to look in all the ingesters if shardByAllLabels is enabled. 
- resps, err := d.ForReplicationSet(ctx, replicationSet, d.cfg.ZoneResultsQuorumMetadata, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + resps, err := d.ForReplicationSet(ctx, replicationSet, d.cfg.ZoneResultsQuorumMetadata, false, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { return client.MetricsMetadata(ctx, req) }) if err != nil { @@ -1495,7 +1495,7 @@ func (d *Distributor) UserStats(ctx context.Context) (*ingester.UserStats, error replicationSet.MaxErrors = 0 req := &ingester_client.UserStatsRequest{} - resps, err := d.ForReplicationSet(ctx, replicationSet, false, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { + resps, err := d.ForReplicationSet(ctx, replicationSet, false, false, func(ctx context.Context, client ingester_client.IngesterClient) (interface{}, error) { return client.UserStats(ctx, req) }) if err != nil { diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 8df0527f2ed..4a2e04768ea 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -1316,7 +1316,7 @@ func TestDistributor_PushQuery(t *testing.T) { assert.Nil(t, err) var response model.Matrix - series, err := ds[0].QueryStream(ctx, 0, 10, tc.matchers...) + series, err := ds[0].QueryStream(ctx, 0, 10, false, tc.matchers...) assert.Equal(t, tc.expectedError, err) if series == nil { @@ -1378,7 +1378,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxChunksPerQueryLimitIsReac // Since the number of series (and thus chunks) is equal to the limit (but doesn't // exceed it), we expect a query running on all series to succeed. - queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) assert.Len(t, queryRes.Chunkseries, initialSeries) @@ -1396,7 +1396,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxChunksPerQueryLimitIsReac // Since the number of series (and thus chunks) is exceeding to the limit, we expect // a query running on all series to fail. - _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.Error(t, err) assert.Contains(t, err.Error(), "the query hit the max number of chunks limit") } @@ -1440,7 +1440,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxSeriesPerQueryLimitIsReac // Since the number of series is equal to the limit (but doesn't // exceed it), we expect a query running on all series to succeed. - queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) assert.Len(t, queryRes.Chunkseries, initialSeries) @@ -1456,7 +1456,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxSeriesPerQueryLimitIsReac // Since the number of series is exceeding the limit, we expect // a query running on all series to fail. - _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) 
require.Error(t, err) assert.Contains(t, err.Error(), "max number of series limit") } @@ -1494,7 +1494,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxChunkBytesPerQueryLimitIs writeRes, err := ds[0].Push(ctx, writeReq) assert.Equal(t, &cortexpb.WriteResponse{}, writeRes) assert.Nil(t, err) - chunkSizeResponse, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + chunkSizeResponse, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) // Use the resulting chunks size to calculate the limit as (series to add + our test series) * the response chunk size. @@ -1516,7 +1516,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxChunkBytesPerQueryLimitIs // Since the number of chunk bytes is equal to the limit (but doesn't // exceed it), we expect a query running on all series to succeed. - queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) assert.Len(t, queryRes.Chunkseries, seriesToAdd) @@ -1532,7 +1532,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxChunkBytesPerQueryLimitIs // Since the aggregated chunk size is exceeding the limit, we expect // a query running on all series to fail. - _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.Error(t, err) assert.Equal(t, err, validation.LimitError(fmt.Sprintf(limiter.ErrMaxChunkBytesHit, maxBytesLimit))) } @@ -1571,7 +1571,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxDataBytesPerQueryLimitIsR writeRes, err := ds[0].Push(ctx, writeReq) assert.Equal(t, &cortexpb.WriteResponse{}, writeRes) assert.Nil(t, err) - dataSizeResponse, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + dataSizeResponse, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) // Use the resulting chunks size to calculate the limit as (series to add + our test series) * the response chunk size. @@ -1593,7 +1593,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxDataBytesPerQueryLimitIsR // Since the number of chunk bytes is equal to the limit (but doesn't // exceed it), we expect a query running on all series to succeed. - queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + queryRes, err := ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) require.NoError(t, err) assert.Len(t, queryRes.Chunkseries, seriesToAdd) @@ -1609,7 +1609,7 @@ func TestDistributor_QueryStream_ShouldReturnErrorIfMaxDataBytesPerQueryLimitIsR // Since the aggregated chunk size is exceeding the limit, we expect // a query running on all series to fail. - _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, allSeriesMatchers...) + _, err = ds[0].QueryStream(ctx, math.MinInt32, math.MaxInt32, false, allSeriesMatchers...) 
require.Error(t, err) assert.Equal(t, err, validation.LimitError(fmt.Sprintf(limiter.ErrMaxDataBytesHit, maxBytesLimit))) } @@ -2065,7 +2065,7 @@ func BenchmarkDistributor_GetLabelsValues(b *testing.B) { b.ResetTimer() b.ReportAllocs() for i := 0; i < b.N; i++ { - _, err := ds[0].LabelValuesForLabelName(ctx, model.Time(time.Now().UnixMilli()), model.Time(time.Now().UnixMilli()), "__name__", nil) + _, err := ds[0].LabelValuesForLabelName(ctx, model.Time(time.Now().UnixMilli()), model.Time(time.Now().UnixMilli()), "__name__", nil, false) require.NoError(b, err) } }) @@ -2397,7 +2397,7 @@ func TestSlowQueries(t *testing.T) { shardByAllLabels: shardByAllLabels, }) - _, err := ds[0].QueryStream(ctx, 0, 10, nameMatcher) + _, err := ds[0].QueryStream(ctx, 0, 10, false, nameMatcher) assert.Equal(t, expectedErr, err) }) } @@ -2431,7 +2431,7 @@ func TestDistributor_MetricsForLabelMatchers_SingleSlowIngester(t *testing.T) { } for i := 0; i < 50; i++ { - _, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, mustNewMatcher(labels.MatchEqual, model.MetricNameLabel, "test")) + _, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, false, mustNewMatcher(labels.MatchEqual, model.MetricNameLabel, "test")) require.NoError(t, err) } } @@ -2600,7 +2600,7 @@ func TestDistributor_MetricsForLabelMatchers(t *testing.T) { } { - metrics, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, testData.matchers...) + metrics, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, false, testData.matchers...) if testData.expectedErr != nil { assert.ErrorIs(t, err, testData.expectedErr) @@ -2618,7 +2618,7 @@ func TestDistributor_MetricsForLabelMatchers(t *testing.T) { } { - metrics, err := ds[0].MetricsForLabelMatchersStream(ctx, now, now, nil, testData.matchers...) + metrics, err := ds[0].MetricsForLabelMatchersStream(ctx, now, now, nil, false, testData.matchers...) if testData.expectedErr != nil { assert.ErrorIs(t, err, testData.expectedErr) return @@ -2705,7 +2705,7 @@ func BenchmarkDistributor_MetricsForLabelMatchers(b *testing.B) { for n := 0; n < b.N; n++ { now := model.Now() - metrics, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, testData.matchers...) + metrics, err := ds[0].MetricsForLabelMatchers(ctx, now, now, nil, false, testData.matchers...) if testData.expectedErr != nil { assert.EqualError(b, err, testData.expectedErr.Error()) diff --git a/pkg/distributor/query.go b/pkg/distributor/query.go index c81e9f3e776..3733d9dc353 100644 --- a/pkg/distributor/query.go +++ b/pkg/distributor/query.go @@ -14,6 +14,7 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" ingester_client "github.com/cortexproject/cortex/pkg/ingester/client" + "github.com/cortexproject/cortex/pkg/querier/partialdata" "github.com/cortexproject/cortex/pkg/querier/stats" "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/tenant" @@ -52,7 +53,7 @@ func (d *Distributor) QueryExemplars(ctx context.Context, from, to model.Time, m } // QueryStream multiple ingesters via the streaming interface and returns big ol' set of chunks. 
-func (d *Distributor) QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*ingester_client.QueryStreamResponse, error) { +func (d *Distributor) QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*ingester_client.QueryStreamResponse, error) { var result *ingester_client.QueryStreamResponse err := instrument.CollectedRequest(ctx, "Distributor.QueryStream", d.queryDuration, instrument.ErrorCode, func(ctx context.Context) error { req, err := ingester_client.ToQueryRequest(from, to, matchers) @@ -65,7 +66,7 @@ func (d *Distributor) QueryStream(ctx context.Context, from, to model.Time, matc return err } - result, err = d.queryIngesterStream(ctx, replicationSet, req) + result, err = d.queryIngesterStream(ctx, replicationSet, req, partialDataEnabled) if err != nil { return err } @@ -160,7 +161,7 @@ func mergeExemplarSets(a, b []cortexpb.Exemplar) []cortexpb.Exemplar { func (d *Distributor) queryIngestersExemplars(ctx context.Context, replicationSet ring.ReplicationSet, req *ingester_client.ExemplarQueryRequest) (*ingester_client.ExemplarQueryResponse, error) { // Fetch exemplars from multiple ingesters in parallel, using the replicationSet // to deal with consistency. - results, err := replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, false, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { + results, err := replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, false, false, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { client, err := d.ingesterPool.GetClientFor(ing.Addr) if err != nil { return nil, err @@ -220,14 +221,14 @@ func mergeExemplarQueryResponses(results []interface{}) *ingester_client.Exempla } // queryIngesterStream queries the ingesters using the new streaming API. 
-func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ring.ReplicationSet, req *ingester_client.QueryRequest) (*ingester_client.QueryStreamResponse, error) { +func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ring.ReplicationSet, req *ingester_client.QueryRequest, partialDataEnabled bool) (*ingester_client.QueryStreamResponse, error) { var ( queryLimiter = limiter.QueryLimiterFromContextWithFallback(ctx) reqStats = stats.FromContext(ctx) ) // Fetch samples from multiple ingesters - results, err := replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, false, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { + results, err := replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, false, partialDataEnabled, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { client, err := d.ingesterPool.GetClientFor(ing.Addr) if err != nil { return nil, err @@ -287,7 +288,7 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ri } return result, nil }) - if err != nil { + if err != nil && !partialdata.IsPartialDataError(err) { return nil, err } @@ -328,5 +329,10 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ri reqStats.AddFetchedChunks(uint64(chksCount)) reqStats.AddFetchedSamples(uint64(resp.SamplesCount())) + if partialdata.IsPartialDataError(err) { + level.Info(d.log).Log("msg", "returning partial data") + return resp, err + } + return resp, nil } diff --git a/pkg/querier/distributor_queryable.go b/pkg/querier/distributor_queryable.go index 709294e6fa7..46965c8eee3 100644 --- a/pkg/querier/distributor_queryable.go +++ b/pkg/querier/distributor_queryable.go @@ -16,7 +16,9 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" + "github.com/cortexproject/cortex/pkg/querier/partialdata" "github.com/cortexproject/cortex/pkg/querier/series" + "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/chunkcompat" "github.com/cortexproject/cortex/pkg/util/spanlogger" @@ -25,24 +27,25 @@ import ( // Distributor is the read interface to the distributor, made an interface here // to reduce package coupling. 
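Note: the `queryIngesterStream` hunk above captures the core contract of this change: when partial data is enabled and only some ingesters fail, the merged response is returned together with `partialdata.ErrPartialData` instead of being discarded, and the querier layer (the `distributor_queryable.go` hunks below) converts that error into a PromQL warning. The following self-contained sketch of the "result plus sentinel error" shape is illustrative only and not part of this diff; it assumes callers always test the sentinel before treating the error as fatal:

```go
package main

import (
	"errors"
	"fmt"
)

// errPartialData plays the role of partialdata.ErrPartialData in this sketch.
var errPartialData = errors.New("query result may contain partial data")

// fetch returns whatever it managed to gather; if some (but not all) replicas
// failed, it returns the partial result together with the sentinel error.
func fetch(failedReplicas, totalReplicas int, partialDataEnabled bool) ([]string, error) {
	result := []string{"series-from-healthy-replicas"}
	switch {
	case failedReplicas == 0:
		return result, nil
	case partialDataEnabled && failedReplicas < totalReplicas:
		return result, errPartialData // keep the data, flag it as partial
	default:
		return nil, fmt.Errorf("too many failed replicas: %d/%d", failedReplicas, totalReplicas)
	}
}

func main() {
	res, err := fetch(1, 3, true)
	if err != nil && !errors.Is(err, errPartialData) {
		panic(err) // only hard failures abort the query
	}
	if errors.Is(err, errPartialData) {
		fmt.Println("warning:", err) // surfaced to the user as a warning, not an error
	}
	fmt.Println(res)
}
```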
type Distributor interface { - QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) + QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) QueryExemplars(ctx context.Context, from, to model.Time, matchers ...[]*labels.Matcher) (*client.ExemplarQueryResponse, error) - LabelValuesForLabelName(ctx context.Context, from, to model.Time, label model.LabelName, hint *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) - LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, label model.LabelName, hint *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) - LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) - LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) - MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) - MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) + LabelValuesForLabelName(ctx context.Context, from, to model.Time, label model.LabelName, hint *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) + LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, label model.LabelName, hint *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) + LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) + LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) + MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) + MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) MetricsMetadata(ctx context.Context) ([]scrape.MetricMetadata, error) } -func newDistributorQueryable(distributor Distributor, streamingMetdata bool, labelNamesWithMatchers bool, iteratorFn chunkIteratorFunc, queryIngestersWithin time.Duration) QueryableWithFilter { +func newDistributorQueryable(distributor Distributor, streamingMetdata bool, labelNamesWithMatchers bool, iteratorFn chunkIteratorFunc, queryIngestersWithin time.Duration, isPartialDataEnabled partialdata.IsCfgEnabledFunc) QueryableWithFilter { return distributorQueryable{ distributor: distributor, streamingMetdata: streamingMetdata, labelNamesWithMatchers: labelNamesWithMatchers, iteratorFn: iteratorFn, queryIngestersWithin: queryIngestersWithin, + isPartialDataEnabled: isPartialDataEnabled, } } @@ -52,6 +55,7 @@ type distributorQueryable struct { labelNamesWithMatchers bool iteratorFn chunkIteratorFunc queryIngestersWithin time.Duration + isPartialDataEnabled partialdata.IsCfgEnabledFunc } func (d distributorQueryable) Querier(mint, maxt int64) (storage.Querier, error) { @@ -63,6 +67,7 @@ func (d distributorQueryable) Querier(mint, maxt int64) (storage.Querier, error) labelNamesMatchers: d.labelNamesWithMatchers, chunkIterFn: d.iteratorFn, queryIngestersWithin: 
d.queryIngestersWithin, + isPartialDataEnabled: d.isPartialDataEnabled, }, nil } @@ -78,6 +83,7 @@ type distributorQuerier struct { labelNamesMatchers bool chunkIterFn chunkIteratorFunc queryIngestersWithin time.Duration + isPartialDataEnabled partialdata.IsCfgEnabledFunc } // Select implements storage.Querier interface. @@ -110,6 +116,8 @@ func (q *distributorQuerier) Select(ctx context.Context, sortSeries bool, sp *st } } + partialDataEnabled := q.partialDataEnabled(ctx) + // In the recent versions of Prometheus, we pass in the hint but with Func set to "series". // See: https://github.com/prometheus/prometheus/pull/8050 if sp != nil && sp.Func == "series" { @@ -119,23 +127,32 @@ func (q *distributorQuerier) Select(ctx context.Context, sortSeries bool, sp *st ) if q.streamingMetadata { - ms, err = q.distributor.MetricsForLabelMatchersStream(ctx, model.Time(minT), model.Time(maxT), sp, matchers...) + ms, err = q.distributor.MetricsForLabelMatchersStream(ctx, model.Time(minT), model.Time(maxT), sp, partialDataEnabled, matchers...) } else { - ms, err = q.distributor.MetricsForLabelMatchers(ctx, model.Time(minT), model.Time(maxT), sp, matchers...) + ms, err = q.distributor.MetricsForLabelMatchers(ctx, model.Time(minT), model.Time(maxT), sp, partialDataEnabled, matchers...) } - if err != nil { + if err != nil && !partialdata.IsPartialDataError(err) { return storage.ErrSeriesSet(err) } + + seriesSet := series.MetricsToSeriesSet(ctx, sortSeries, ms) + + if partialdata.IsPartialDataError(err) { + warning := seriesSet.Warnings() + return series.NewSeriesSetWithWarnings(seriesSet, warning.Add(err)) + } + return series.MetricsToSeriesSet(ctx, sortSeries, ms) } - return q.streamingSelect(ctx, sortSeries, minT, maxT, matchers) + return q.streamingSelect(ctx, sortSeries, partialDataEnabled, minT, maxT, matchers) } -func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries bool, minT, maxT int64, matchers []*labels.Matcher) storage.SeriesSet { - results, err := q.distributor.QueryStream(ctx, model.Time(minT), model.Time(maxT), matchers...) - if err != nil { +func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, partialDataEnabled bool, minT, maxT int64, matchers []*labels.Matcher) storage.SeriesSet { + results, err := q.distributor.QueryStream(ctx, model.Time(minT), model.Time(maxT), partialDataEnabled, matchers...) + + if err != nil && !partialdata.IsPartialDataError(err) { return storage.ErrSeriesSet(err) } @@ -165,7 +182,14 @@ func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries boo return storage.EmptySeriesSet() } - return series.NewConcreteSeriesSet(sortSeries, serieses) + seriesSet := series.NewConcreteSeriesSet(sortSeries, serieses) + + if partialdata.IsPartialDataError(err) { + warnings := seriesSet.Warnings() + return series.NewSeriesSetWithWarnings(seriesSet, warnings.Add(err)) + } + + return seriesSet } func (q *distributorQuerier) LabelValues(ctx context.Context, name string, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { @@ -174,18 +198,27 @@ func (q *distributorQuerier) LabelValues(ctx context.Context, name string, hints err error ) + partialDataEnabled := q.partialDataEnabled(ctx) + if q.streamingMetadata { - lvs, err = q.distributor.LabelValuesForLabelNameStream(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, matchers...) 
+ lvs, err = q.distributor.LabelValuesForLabelNameStream(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...) } else { - lvs, err = q.distributor.LabelValuesForLabelName(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, matchers...) + lvs, err = q.distributor.LabelValuesForLabelName(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...) + } + + if partialdata.IsPartialDataError(err) { + warnings := annotations.Annotations(nil) + return lvs, warnings.Add(err), nil } return lvs, nil, err } func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { + partialDataEnabled := q.partialDataEnabled(ctx) + if len(matchers) > 0 && !q.labelNamesMatchers { - return q.labelNamesWithMatchers(ctx, hints, matchers...) + return q.labelNamesWithMatchers(ctx, hints, partialDataEnabled, matchers...) } log, ctx := spanlogger.New(ctx, "distributorQuerier.LabelNames") @@ -197,16 +230,21 @@ func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.Labe ) if q.streamingMetadata { - ln, err = q.distributor.LabelNamesStream(ctx, model.Time(q.mint), model.Time(q.maxt), hints, matchers...) + ln, err = q.distributor.LabelNamesStream(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...) } else { - ln, err = q.distributor.LabelNames(ctx, model.Time(q.mint), model.Time(q.maxt), hints, matchers...) + ln, err = q.distributor.LabelNames(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...) + } + + if partialdata.IsPartialDataError(err) { + warnings := annotations.Annotations(nil) + return ln, warnings.Add(err), nil } return ln, nil, err } // labelNamesWithMatchers performs the LabelNames call by calling ingester's MetricsForLabelMatchers method -func (q *distributorQuerier) labelNamesWithMatchers(ctx context.Context, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { +func (q *distributorQuerier) labelNamesWithMatchers(ctx context.Context, hints *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { log, ctx := spanlogger.New(ctx, "distributorQuerier.labelNamesWithMatchers") defer log.Span.Finish() @@ -216,12 +254,12 @@ func (q *distributorQuerier) labelNamesWithMatchers(ctx context.Context, hints * ) if q.streamingMetadata { - ms, err = q.distributor.MetricsForLabelMatchersStream(ctx, model.Time(q.mint), model.Time(q.maxt), labelHintsToSelectHints(hints), matchers...) + ms, err = q.distributor.MetricsForLabelMatchersStream(ctx, model.Time(q.mint), model.Time(q.maxt), labelHintsToSelectHints(hints), partialDataEnabled, matchers...) } else { - ms, err = q.distributor.MetricsForLabelMatchers(ctx, model.Time(q.mint), model.Time(q.maxt), labelHintsToSelectHints(hints), matchers...) + ms, err = q.distributor.MetricsForLabelMatchers(ctx, model.Time(q.mint), model.Time(q.maxt), labelHintsToSelectHints(hints), partialDataEnabled, matchers...) 
} - if err != nil { + if err != nil && !partialdata.IsPartialDataError(err) { return nil, nil, err } namesMap := make(map[string]struct{}) @@ -238,6 +276,11 @@ func (q *distributorQuerier) labelNamesWithMatchers(ctx context.Context, hints * } sort.Strings(names) + if partialdata.IsPartialDataError(err) { + warnings := annotations.Annotations(nil) + return names, warnings.Add(err), nil + } + return names, nil, nil } @@ -245,6 +288,15 @@ func (q *distributorQuerier) Close() error { return nil } +func (q *distributorQuerier) partialDataEnabled(ctx context.Context) bool { + userID, err := tenant.TenantID(ctx) + if err != nil { + return false + } + + return q.isPartialDataEnabled != nil && q.isPartialDataEnabled(userID) +} + type distributorExemplarQueryable struct { distributor Distributor } diff --git a/pkg/querier/distributor_queryable_test.go b/pkg/querier/distributor_queryable_test.go index 1914c49e88e..457fba03cbc 100644 --- a/pkg/querier/distributor_queryable_test.go +++ b/pkg/querier/distributor_queryable_test.go @@ -18,6 +18,7 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/querier/batch" + "github.com/cortexproject/cortex/pkg/querier/partialdata" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/chunkcompat" "github.com/cortexproject/cortex/pkg/util/validation" @@ -89,7 +90,7 @@ func TestDistributorQuerier_SelectShouldHonorQueryIngestersWithin(t *testing.T) distributor.On("MetricsForLabelMatchersStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]model.Metric{}, nil) ctx := user.InjectOrgID(context.Background(), "test") - queryable := newDistributorQueryable(distributor, streamingMetadataEnabled, true, nil, testData.queryIngestersWithin) + queryable := newDistributorQueryable(distributor, streamingMetadataEnabled, true, nil, testData.queryIngestersWithin, nil) querier, err := queryable.Querier(testData.queryMinT, testData.queryMaxT) require.NoError(t, err) @@ -128,7 +129,7 @@ func TestDistributorQueryableFilter(t *testing.T) { t.Parallel() d := &MockDistributor{} - dq := newDistributorQueryable(d, false, true, nil, 1*time.Hour) + dq := newDistributorQueryable(d, false, true, nil, 1*time.Hour, nil) now := time.Now() @@ -146,14 +147,15 @@ func TestIngesterStreaming(t *testing.T) { t.Parallel() now := time.Now() + for _, enc := range encodings { - promChunk := util.GenerateChunk(t, time.Second, model.TimeFromUnix(now.Unix()), 10, enc) - clientChunks, err := chunkcompat.ToChunks([]chunk.Chunk{promChunk}) - require.NoError(t, err) + for _, partialDataEnabled := range []bool{false, true} { + promChunk := util.GenerateChunk(t, time.Second, model.TimeFromUnix(now.Unix()), 10, enc) + clientChunks, err := chunkcompat.ToChunks([]chunk.Chunk{promChunk}) + require.NoError(t, err) - d := &MockDistributor{} - d.On("QueryStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return( - &client.QueryStreamResponse{ + d := &MockDistributor{} + queryResponse := &client.QueryStreamResponse{ Chunkseries: []client.TimeSeriesChunk{ { Labels: []cortexpb.LabelAdapter{ @@ -168,31 +170,43 @@ func TestIngesterStreaming(t *testing.T) { Chunks: clientChunks, }, }, - }, - nil) - - ctx := user.InjectOrgID(context.Background(), "0") - queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0) - querier, err := queryable.Querier(mint, maxt) - require.NoError(t, err) - - 
seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) - require.NoError(t, seriesSet.Err()) - - require.True(t, seriesSet.Next()) - series := seriesSet.At() - require.Equal(t, labels.Labels{{Name: "bar", Value: "baz"}}, series.Labels()) - chkIter := series.Iterator(nil) - require.Equal(t, enc.ChunkValueType(), chkIter.Next()) - - require.True(t, seriesSet.Next()) - series = seriesSet.At() - require.Equal(t, labels.Labels{{Name: "foo", Value: "bar"}}, series.Labels()) - chkIter = series.Iterator(chkIter) - require.Equal(t, enc.ChunkValueType(), chkIter.Next()) - - require.False(t, seriesSet.Next()) - require.NoError(t, seriesSet.Err()) + } + var partialDataErr error + if partialDataEnabled { + partialDataErr = partialdata.ErrPartialData + } + d.On("QueryStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(queryResponse, partialDataErr) + + ctx := user.InjectOrgID(context.Background(), "0") + + queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0, func(string) bool { + return partialDataEnabled + }) + querier, err := queryable.Querier(mint, maxt) + require.NoError(t, err) + + seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) + require.NoError(t, seriesSet.Err()) + + require.True(t, seriesSet.Next()) + series := seriesSet.At() + require.Equal(t, labels.Labels{{Name: "bar", Value: "baz"}}, series.Labels()) + chkIter := series.Iterator(nil) + require.Equal(t, enc.ChunkValueType(), chkIter.Next()) + + require.True(t, seriesSet.Next()) + series = seriesSet.At() + require.Equal(t, labels.Labels{{Name: "foo", Value: "bar"}}, series.Labels()) + chkIter = series.Iterator(chkIter) + require.Equal(t, enc.ChunkValueType(), chkIter.Next()) + + require.False(t, seriesSet.Next()) + require.NoError(t, seriesSet.Err()) + + if partialDataEnabled { + require.Contains(t, seriesSet.Warnings(), partialdata.ErrPartialData.Error()) + } + } } } @@ -204,40 +218,52 @@ func TestDistributorQuerier_LabelNames(t *testing.T) { for _, labelNamesWithMatchers := range []bool{false, true} { for _, streamingEnabled := range []bool{false, true} { - streamingEnabled := streamingEnabled - labelNamesWithMatchers := labelNamesWithMatchers - t.Run("with matchers", func(t *testing.T) { - t.Parallel() - - metrics := []model.Metric{ - {"foo": "bar"}, - {"job": "baz"}, - {"job": "baz", "foo": "boom"}, - } - d := &MockDistributor{} - - if labelNamesWithMatchers { - d.On("LabelNames", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). - Return(labelNames, nil) - d.On("LabelNamesStream", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). - Return(labelNames, nil) - } else { - d.On("MetricsForLabelMatchers", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). - Return(metrics, nil) - d.On("MetricsForLabelMatchersStream", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). 
- Return(metrics, nil) - } + for _, partialDataEnabled := range []bool{false, true} { + streamingEnabled := streamingEnabled + labelNamesWithMatchers := labelNamesWithMatchers + t.Run("with matchers", func(t *testing.T) { + t.Parallel() + + metrics := []model.Metric{ + {"foo": "bar"}, + {"job": "baz"}, + {"job": "baz", "foo": "boom"}, + } + d := &MockDistributor{} - queryable := newDistributorQueryable(d, streamingEnabled, labelNamesWithMatchers, nil, 0) - querier, err := queryable.Querier(mint, maxt) - require.NoError(t, err) + var partialDataErr error + if partialDataEnabled { + partialDataErr = partialdata.ErrPartialData + } + if labelNamesWithMatchers { + d.On("LabelNames", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). + Return(labelNames, partialDataErr) + d.On("LabelNamesStream", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). + Return(labelNames, partialDataErr) + } else { + d.On("MetricsForLabelMatchers", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). + Return(metrics, partialDataErr) + d.On("MetricsForLabelMatchersStream", mock.Anything, model.Time(mint), model.Time(maxt), mock.Anything, someMatchers). + Return(metrics, partialDataErr) + } - ctx := context.Background() - names, warnings, err := querier.LabelNames(ctx, nil, someMatchers...) - require.NoError(t, err) - assert.Empty(t, warnings) - assert.Equal(t, labelNames, names) - }) + queryable := newDistributorQueryable(d, streamingEnabled, labelNamesWithMatchers, nil, 0, func(string) bool { + return partialDataEnabled + }) + querier, err := queryable.Querier(mint, maxt) + require.NoError(t, err) + + ctx := context.Background() + names, warnings, err := querier.LabelNames(ctx, nil, someMatchers...) + require.NoError(t, err) + if partialDataEnabled { + assert.Contains(t, warnings, partialdata.ErrPartialData.Error()) + } else { + assert.Empty(t, warnings) + } + assert.Equal(t, labelNames, names) + }) + } } } } diff --git a/pkg/querier/lazyquery/lazyquery.go b/pkg/querier/lazyquery/lazyquery.go index d00c974551b..5ccbc769058 100644 --- a/pkg/querier/lazyquery/lazyquery.go +++ b/pkg/querier/lazyquery/lazyquery.go @@ -79,5 +79,8 @@ func (s *lazySeriesSet) Err() error { // Warnings implements storage.SeriesSet. 
func (s *lazySeriesSet) Warnings() annotations.Annotations { - return nil + if s.next == nil { + s.next = <-s.future + } + return s.next.Warnings() } diff --git a/pkg/querier/partialdata/partia_data.go b/pkg/querier/partialdata/partia_data.go new file mode 100644 index 00000000000..29968d94112 --- /dev/null +++ b/pkg/querier/partialdata/partia_data.go @@ -0,0 +1,13 @@ +package partialdata + +import ( + "errors" +) + +type IsCfgEnabledFunc func(userID string) bool + +var ErrPartialData = errors.New("query result may contain partial data") + +func IsPartialDataError(err error) bool { + return errors.Is(err, ErrPartialData) +} diff --git a/pkg/querier/partialdata/partial_data_test.go b/pkg/querier/partialdata/partial_data_test.go new file mode 100644 index 00000000000..d11ffc967fb --- /dev/null +++ b/pkg/querier/partialdata/partial_data_test.go @@ -0,0 +1,13 @@ +package partialdata + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestPartialData_ReturnPartialData(t *testing.T) { + assert.False(t, IsPartialDataError(fmt.Errorf(""))) + assert.True(t, IsPartialDataError(ErrPartialData)) +} diff --git a/pkg/querier/querier.go b/pkg/querier/querier.go index 8927cd13f42..74b360a086d 100644 --- a/pkg/querier/querier.go +++ b/pkg/querier/querier.go @@ -26,6 +26,7 @@ import ( "github.com/cortexproject/cortex/pkg/querier/batch" "github.com/cortexproject/cortex/pkg/querier/lazyquery" + "github.com/cortexproject/cortex/pkg/querier/partialdata" querier_stats "github.com/cortexproject/cortex/pkg/querier/stats" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" @@ -172,10 +173,10 @@ func getChunksIteratorFunction(_ Config) chunkIteratorFunc { } // New builds a queryable and promql engine. 
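Note: the `lazyquery` and `partialdata` hunks above are small but load-bearing: `ErrPartialData` is a plain sentinel checked with `errors.Is`, and `lazySeriesSet.Warnings` must now force the deferred querier rather than return nil, otherwise the partial-data warning would be silently dropped before it reaches the user; `querier.New` (next hunk) then threads the per-tenant enable hook through. A stripped-down sketch of the lazy-warnings pattern follows, with illustrative types rather than the Cortex ones, and is not part of this diff:

```go
package main

import "fmt"

// result stands in for a resolved storage.SeriesSet.
type result struct{ warnings []string }

func (r *result) Warnings() []string { return r.warnings }

// lazyResult mirrors the shape of lazySeriesSet: the real result arrives on a
// channel, and Warnings() must force that future instead of returning nil,
// otherwise a "partial data" warning attached downstream would be lost.
type lazyResult struct {
	next   *result
	future chan *result
}

func (l *lazyResult) Warnings() []string {
	if l.next == nil {
		l.next = <-l.future // block until the deferred query has resolved
	}
	return l.next.Warnings()
}

func main() {
	future := make(chan *result, 1)
	future <- &result{warnings: []string{"query result may contain partial data"}}
	lazy := &lazyResult{future: future}
	fmt.Println(lazy.Warnings()) // the warning survives the lazy indirection
}
```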
-func New(cfg Config, limits *validation.Overrides, distributor Distributor, stores []QueryableWithFilter, reg prometheus.Registerer, logger log.Logger) (storage.SampleAndChunkQueryable, storage.ExemplarQueryable, promql.QueryEngine) { +func New(cfg Config, limits *validation.Overrides, distributor Distributor, stores []QueryableWithFilter, reg prometheus.Registerer, logger log.Logger, isPartialDataEnabled partialdata.IsCfgEnabledFunc) (storage.SampleAndChunkQueryable, storage.ExemplarQueryable, promql.QueryEngine) { iteratorFunc := getChunksIteratorFunction(cfg) - distributorQueryable := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, iteratorFunc, cfg.QueryIngestersWithin) + distributorQueryable := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, iteratorFunc, cfg.QueryIngestersWithin, isPartialDataEnabled) ns := make([]QueryableWithFilter, len(stores)) for ix, s := range stores { diff --git a/pkg/querier/querier_test.go b/pkg/querier/querier_test.go index 69b542e2d9c..eaeb7457231 100644 --- a/pkg/querier/querier_test.go +++ b/pkg/querier/querier_test.go @@ -298,7 +298,7 @@ func TestShouldSortSeriesIfQueryingMultipleQueryables(t *testing.T) { } distributor.On("QueryStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&unorderedResponse, nil) - distributorQueryable := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, batch.NewChunkMergeIterator, cfg.QueryIngestersWithin) + distributorQueryable := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, batch.NewChunkMergeIterator, cfg.QueryIngestersWithin, nil) tCases := []struct { name string @@ -444,7 +444,7 @@ func TestLimits(t *testing.T) { response: &streamResponse, } - distributorQueryableStreaming := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, batch.NewChunkMergeIterator, cfg.QueryIngestersWithin) + distributorQueryableStreaming := newDistributorQueryable(distributor, cfg.IngesterMetadataStreaming, cfg.IngesterLabelNamesWithMatchers, batch.NewChunkMergeIterator, cfg.QueryIngestersWithin, nil) tCases := []struct { name string @@ -580,7 +580,7 @@ func TestQuerier(t *testing.T) { require.NoError(t, err) queryables := []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))} - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) testRangeQuery(t, queryable, queryEngine, through, query, enc) }) } @@ -602,7 +602,7 @@ func TestQuerierMetric(t *testing.T) { queryables := []QueryableWithFilter{} r := prometheus.NewRegistry() reg := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "querier"}, r) - New(cfg, overrides, distributor, queryables, reg, log.NewNopLogger()) + New(cfg, overrides, distributor, queryables, reg, log.NewNopLogger(), nil) assert.NoError(t, promutil.GatherAndCompare(r, strings.NewReader(` # HELP cortex_max_concurrent_queries The maximum number of concurrent queries. 
# TYPE cortex_max_concurrent_queries gauge @@ -684,7 +684,7 @@ func TestNoHistoricalQueryToIngester(t *testing.T) { require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "0") - queryable, _, _ := New(cfg, overrides, distributor, []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))}, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))}, nil, log.NewNopLogger(), nil) query, err := queryEngine.NewRangeQuery(ctx, queryable, nil, "dummy", c.mint, c.maxt, 1*time.Minute) require.NoError(t, err) @@ -778,7 +778,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryIntoFuture(t *testing.T) { ctx := user.InjectOrgID(context.Background(), "0") queryables := []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))} - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) query, err := queryEngine.NewRangeQuery(ctx, queryable, nil, "dummy", c.queryStartTime, c.queryEndTime, time.Minute) require.NoError(t, err) @@ -871,7 +871,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLength(t *testing.T) { distributor := &emptyDistributor{} queryables := []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))} - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) queryEngine := promql.NewEngine(opts) ctx := user.InjectOrgID(context.Background(), "test") @@ -910,7 +910,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLength_Series(t *testing.T) { distributor := &emptyDistributor{} queryables := []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))} - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) ctx := user.InjectOrgID(context.Background(), "test") now := time.Now() @@ -969,7 +969,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLength_Labels(t *testing.T) { distributor := &emptyDistributor{} queryables := []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))} - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) ctx := user.InjectOrgID(context.Background(), "test") @@ -1120,7 +1120,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLookback(t *testing.T) { distributor := &MockDistributor{} distributor.On("QueryStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&client.QueryStreamResponse{}, nil) - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) require.NoError(t, err) query, err := queryEngine.NewRangeQuery(ctx, queryable, nil, testData.query, testData.queryStartTime, testData.queryEndTime, time.Minute) @@ -1149,7 +1149,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLookback(t *testing.T) { distributor.On("MetricsForLabelMatchers", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]model.Metric{}, nil) distributor.On("MetricsForLabelMatchersStream", 
mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]model.Metric{}, nil) - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) q, err := queryable.Querier(util.TimeToMillis(testData.queryStartTime), util.TimeToMillis(testData.queryEndTime)) require.NoError(t, err) @@ -1190,7 +1190,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLookback(t *testing.T) { distributor.On("LabelNames", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]string{}, nil) distributor.On("LabelNamesStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]string{}, nil) - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) q, err := queryable.Querier(util.TimeToMillis(testData.queryStartTime), util.TimeToMillis(testData.queryEndTime)) require.NoError(t, err) @@ -1218,7 +1218,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLookback(t *testing.T) { distributor.On("MetricsForLabelMatchers", mock.Anything, mock.Anything, mock.Anything, mock.Anything, matchers).Return([]model.Metric{}, nil) distributor.On("MetricsForLabelMatchersStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything, matchers).Return([]model.Metric{}, nil) - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) q, err := queryable.Querier(util.TimeToMillis(testData.queryStartTime), util.TimeToMillis(testData.queryEndTime)) require.NoError(t, err) @@ -1245,7 +1245,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLookback(t *testing.T) { distributor.On("LabelValuesForLabelName", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]string{}, nil) distributor.On("LabelValuesForLabelNameStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return([]string{}, nil) - queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, queryables, nil, log.NewNopLogger(), nil) q, err := queryable.Querier(util.TimeToMillis(testData.queryStartTime), util.TimeToMillis(testData.queryEndTime)) require.NoError(t, err) @@ -1365,28 +1365,28 @@ type errDistributor struct{} var errDistributorError = fmt.Errorf("errDistributorError") -func (m *errDistributor) QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { +func (m *errDistributor) QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { return nil, errDistributorError } func (m *errDistributor) QueryExemplars(ctx context.Context, from, to model.Time, matchers ...[]*labels.Matcher) (*client.ExemplarQueryResponse, error) { return nil, errDistributorError } -func (m *errDistributor) LabelValuesForLabelName(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (m *errDistributor) LabelValuesForLabelName(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, 
error) { return nil, errDistributorError } -func (m *errDistributor) LabelValuesForLabelNameStream(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (m *errDistributor) LabelValuesForLabelNameStream(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, errDistributorError } -func (m *errDistributor) LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (m *errDistributor) LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, errDistributorError } -func (m *errDistributor) LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (m *errDistributor) LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, errDistributorError } -func (m *errDistributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (m *errDistributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return nil, errDistributorError } -func (m *errDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (m *errDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return nil, errDistributorError } @@ -1414,7 +1414,7 @@ func (c *emptyChunkStore) IsCalled() bool { type emptyDistributor struct{} -func (d *emptyDistributor) QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { +func (d *emptyDistributor) QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { return &client.QueryStreamResponse{}, nil } @@ -1422,27 +1422,27 @@ func (d *emptyDistributor) QueryExemplars(ctx context.Context, from, to model.Ti return nil, nil } -func (d *emptyDistributor) LabelValuesForLabelName(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (d *emptyDistributor) LabelValuesForLabelName(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, nil } -func (d *emptyDistributor) LabelValuesForLabelNameStream(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (d *emptyDistributor) LabelValuesForLabelNameStream(context.Context, model.Time, model.Time, model.LabelName, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, nil } -func (d *emptyDistributor) LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (d *emptyDistributor) LabelNames(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { 
return nil, nil } -func (d *emptyDistributor) LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, ...*labels.Matcher) ([]string, error) { +func (d *emptyDistributor) LabelNamesStream(context.Context, model.Time, model.Time, *storage.LabelHints, bool, ...*labels.Matcher) ([]string, error) { return nil, nil } -func (d *emptyDistributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (d *emptyDistributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return nil, nil } -func (d *emptyDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (d *emptyDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, through model.Time, hint *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { return nil, nil } @@ -1582,7 +1582,7 @@ func TestShortTermQueryToLTS(t *testing.T) { overrides, err := validation.NewOverrides(DefaultLimitsConfig(), nil) require.NoError(t, err) - queryable, _, _ := New(cfg, overrides, distributor, []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))}, nil, log.NewNopLogger()) + queryable, _, _ := New(cfg, overrides, distributor, []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))}, nil, log.NewNopLogger(), nil) ctx := user.InjectOrgID(context.Background(), "0") query, err := engine.NewRangeQuery(ctx, queryable, nil, "dummy", c.mint, c.maxt, 1*time.Minute) require.NoError(t, err) diff --git a/pkg/querier/testutils.go b/pkg/querier/testutils.go index 478e61ff0c3..0ee4a414640 100644 --- a/pkg/querier/testutils.go +++ b/pkg/querier/testutils.go @@ -29,31 +29,31 @@ func (m *MockDistributor) QueryExemplars(ctx context.Context, from, to model.Tim args := m.Called(ctx, from, to, matchers) return args.Get(0).(*client.ExemplarQueryResponse), args.Error(1) } -func (m *MockDistributor) QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { +func (m *MockDistributor) QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { args := m.Called(ctx, from, to, matchers) return args.Get(0).(*client.QueryStreamResponse), args.Error(1) } -func (m *MockDistributor) LabelValuesForLabelName(ctx context.Context, from, to model.Time, lbl model.LabelName, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (m *MockDistributor) LabelValuesForLabelName(ctx context.Context, from, to model.Time, lbl model.LabelName, hints *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { args := m.Called(ctx, from, to, lbl, hints, matchers) return args.Get(0).([]string), args.Error(1) } -func (m *MockDistributor) LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, lbl model.LabelName, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (m *MockDistributor) LabelValuesForLabelNameStream(ctx context.Context, from, to model.Time, lbl model.LabelName, hints *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, error) { args := 
m.Called(ctx, from, to, lbl, hints, matchers) return args.Get(0).([]string), args.Error(1) } -func (m *MockDistributor) LabelNames(ctx context.Context, from, to model.Time, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (m *MockDistributor) LabelNames(ctx context.Context, from model.Time, to model.Time, hints *storage.LabelHints, b bool, matchers ...*labels.Matcher) ([]string, error) { args := m.Called(ctx, from, to, hints, matchers) return args.Get(0).([]string), args.Error(1) } -func (m *MockDistributor) LabelNamesStream(ctx context.Context, from, to model.Time, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, error) { +func (m *MockDistributor) LabelNamesStream(ctx context.Context, from model.Time, to model.Time, hints *storage.LabelHints, b bool, matchers ...*labels.Matcher) ([]string, error) { args := m.Called(ctx, from, to, hints, matchers) return args.Get(0).([]string), args.Error(1) } -func (m *MockDistributor) MetricsForLabelMatchers(ctx context.Context, from, to model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (m *MockDistributor) MetricsForLabelMatchers(ctx context.Context, from, to model.Time, hints *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { args := m.Called(ctx, from, to, hints, matchers) return args.Get(0).([]model.Metric), args.Error(1) } -func (m *MockDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, to model.Time, hints *storage.SelectHints, matchers ...*labels.Matcher) ([]model.Metric, error) { +func (m *MockDistributor) MetricsForLabelMatchersStream(ctx context.Context, from, to model.Time, hints *storage.SelectHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]model.Metric, error) { args := m.Called(ctx, from, to, hints, matchers) return args.Get(0).([]model.Metric), args.Error(1) } @@ -68,7 +68,7 @@ type MockLimitingDistributor struct { response *client.QueryStreamResponse } -func (m *MockLimitingDistributor) QueryStream(ctx context.Context, from, to model.Time, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { +func (m *MockLimitingDistributor) QueryStream(ctx context.Context, from, to model.Time, partialDataEnabled bool, matchers ...*labels.Matcher) (*client.QueryStreamResponse, error) { var ( queryLimiter = limiter.QueryLimiterFromContextWithFallback(ctx) ) diff --git a/pkg/querier/tripperware/queryrange/results_cache.go b/pkg/querier/tripperware/queryrange/results_cache.go index b3a474ba8e2..d065ac9201b 100644 --- a/pkg/querier/tripperware/queryrange/results_cache.go +++ b/pkg/querier/tripperware/queryrange/results_cache.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "net/http" + "slices" "sort" "strings" "time" @@ -27,6 +28,7 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/cache" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/querier" + "github.com/cortexproject/cortex/pkg/querier/partialdata" "github.com/cortexproject/cortex/pkg/querier/tripperware" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util/flagext" @@ -295,6 +297,9 @@ func (s resultsCache) shouldCacheResponse(ctx context.Context, req tripperware.R if !s.isOffsetCachable(ctx, req) { return false } + if res, ok := r.(*tripperware.PrometheusResponse); ok { + return !slices.Contains(res.Warnings, partialdata.ErrPartialData.Error()) + } return true } diff --git 
a/pkg/querier/tripperware/queryrange/results_cache_test.go b/pkg/querier/tripperware/queryrange/results_cache_test.go index 51b68531b5f..1b448f371f0 100644 --- a/pkg/querier/tripperware/queryrange/results_cache_test.go +++ b/pkg/querier/tripperware/queryrange/results_cache_test.go @@ -18,6 +18,7 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/cache" "github.com/cortexproject/cortex/pkg/cortexpb" + "github.com/cortexproject/cortex/pkg/querier/partialdata" "github.com/cortexproject/cortex/pkg/querier/tripperware" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util/flagext" @@ -554,6 +555,34 @@ func TestShouldCache(t *testing.T) { input: tripperware.Response(&tripperware.PrometheusResponse{}), expected: false, }, + { + name: "contains partial data warning", + request: &tripperware.PrometheusRequest{Query: "metric"}, + input: tripperware.Response(&tripperware.PrometheusResponse{ + Headers: []*tripperware.PrometheusResponseHeader{ + { + Name: "meaninglessheader", + Values: []string{}, + }, + }, + Warnings: []string{partialdata.ErrPartialData.Error()}, + }), + expected: false, + }, + { + name: "contains other warning", + request: &tripperware.PrometheusRequest{Query: "metric"}, + input: tripperware.Response(&tripperware.PrometheusResponse{ + Headers: []*tripperware.PrometheusResponseHeader{ + { + Name: "meaninglessheader", + Values: []string{}, + }, + }, + Warnings: []string{"other warning"}, + }), + expected: true, + }, } { { t.Run(tc.name, func(t *testing.T) { diff --git a/pkg/ring/replication_set.go b/pkg/ring/replication_set.go index 0182207fd7a..497e7287930 100644 --- a/pkg/ring/replication_set.go +++ b/pkg/ring/replication_set.go @@ -4,6 +4,8 @@ import ( "context" "sort" "time" + + "github.com/cortexproject/cortex/pkg/querier/partialdata" ) // ReplicationSet describes the instances to talk to for a given key, and how @@ -23,7 +25,7 @@ type ReplicationSet struct { // Do function f in parallel for all replicas in the set, erroring is we exceed // MaxErrors and returning early otherwise. zoneResultsQuorum allows only include // results from zones that already reach quorum to improve performance. 
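Note: the `ReplicationSet.Do` and tracker hunks that follow implement the zone-level policy behind all of the above: with partial data enabled, a query only hard-fails once every zone has failed, while a partial failure returns whatever results were gathered together with `partialdata.ErrPartialData` (and, per the results-cache hunks above, responses carrying that warning are never cached). A simplified standalone sketch of that decision, not the actual tracker code and not part of this diff:

```go
package main

import (
	"errors"
	"fmt"
)

var errPartialData = errors.New("query result may contain partial data")

// decide applies the policy sketched above: exceeding maxUnavailableZones is
// fatal unless partial data is enabled and at least one zone still answered.
func decide(failedZones, totalZones, maxUnavailableZones int, partialDataEnabled bool, results []string) ([]string, error) {
	trackerFailed := failedZones > maxUnavailableZones
	switch {
	case !trackerFailed:
		return results, nil
	case partialDataEnabled && failedZones < totalZones:
		return results, errPartialData
	default:
		return nil, fmt.Errorf("%d out of %d zones failed", failedZones, totalZones)
	}
}

func main() {
	// 2 of 3 zones down, partial data on: keep the surviving zone's results.
	fmt.Println(decide(2, 3, 1, true, []string{"zone3-results"}))
	// All zones down: fail even with partial data enabled.
	fmt.Println(decide(3, 3, 1, true, nil))
}
```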
-func (r ReplicationSet) Do(ctx context.Context, delay time.Duration, zoneResultsQuorum bool, f func(context.Context, *InstanceDesc) (interface{}, error)) ([]interface{}, error) { +func (r ReplicationSet) Do(ctx context.Context, delay time.Duration, zoneResultsQuorum bool, partialDataEnabled bool, f func(context.Context, *InstanceDesc) (interface{}, error)) ([]interface{}, error) { type instanceResult struct { res interface{} err error @@ -68,13 +70,20 @@ func (r ReplicationSet) Do(ctx context.Context, delay time.Duration, zoneResults }(i, &r.Instances[i]) } + trackerFailed := false + cnt := 0 + +track: for !tracker.succeeded() { select { case res := <-ch: tracker.done(res.instance, res.res, res.err) if res.err != nil { if tracker.failed() { - return nil, res.err + if !partialDataEnabled || tracker.failedInAllZones() { + return nil, res.err + } + trackerFailed = true } // force one of the delayed requests to start @@ -82,12 +91,20 @@ func (r ReplicationSet) Do(ctx context.Context, delay time.Duration, zoneResults forceStart <- struct{}{} } } + cnt++ + if cnt == len(r.Instances) { + break track + } case <-ctx.Done(): return nil, ctx.Err() } } + if partialDataEnabled && trackerFailed { + return tracker.getResults(), partialdata.ErrPartialData + } + return tracker.getResults(), nil } diff --git a/pkg/ring/replication_set_test.go b/pkg/ring/replication_set_test.go index 7e764bdf547..b5764563797 100644 --- a/pkg/ring/replication_set_test.go +++ b/pkg/ring/replication_set_test.go @@ -9,6 +9,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" + + "github.com/cortexproject/cortex/pkg/querier/partialdata" ) func TestReplicationSet_GetAddresses(t *testing.T) { @@ -121,6 +123,7 @@ func TestReplicationSet_Do(t *testing.T) { want []interface{} expectedError error zoneResultsQuorum bool + queryPartialData bool }{ { name: "max errors = 0, no errors no delay", @@ -191,6 +194,23 @@ func TestReplicationSet_Do(t *testing.T) { maxUnavailableZones: 1, expectedError: errZoneFailure, }, + { + name: "with partial data enabled and max unavailable zones = 1, should succeed on instances failing in 2 out of 3 zones (3 instances)", + instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}}, + f: failingFunctionOnZones("zone1", "zone2"), + maxUnavailableZones: 1, + queryPartialData: true, + want: []interface{}{1}, + expectedError: partialdata.ErrPartialData, + }, + { + name: "with partial data enabled, should fail on instances failing in all zones", + instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}, {Zone: "zone2"}, {Zone: "zone3"}}, + f: failingFunctionOnZones("zone1", "zone2", "zone3"), + maxUnavailableZones: 1, + expectedError: errZoneFailure, + queryPartialData: true, + }, { name: "max unavailable zones = 1, should succeed on instances failing in 1 out of 3 zones (6 instances)", instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone2"}, {Zone: "zone3"}, {Zone: "zone3"}}, @@ -242,7 +262,7 @@ func TestReplicationSet_Do(t *testing.T) { cancel() }) } - got, err := r.Do(ctx, tt.delay, tt.zoneResultsQuorum, tt.f) + got, err := r.Do(ctx, tt.delay, tt.zoneResultsQuorum, tt.queryPartialData, tt.f) if tt.expectedError != nil { assert.Equal(t, tt.expectedError, err) } else { diff --git a/pkg/ring/replication_set_tracker.go b/pkg/ring/replication_set_tracker.go index dd229097472..3d9d67901f9 100644 --- a/pkg/ring/replication_set_tracker.go +++ 
b/pkg/ring/replication_set_tracker.go @@ -12,6 +12,9 @@ type replicationSetResultTracker interface { // Returns true if the maximum number of failed executions have been reached. failed() bool + // Returns true if executions failed in all zones. Only relevant for zoneAwareResultTracker. + failedInAllZones() bool + // Returns recorded results. getResults() []interface{} } @@ -51,6 +54,10 @@ func (t *defaultResultTracker) failed() bool { return t.numErrors > t.maxErrors } +func (t *defaultResultTracker) failedInAllZones() bool { + return false +} + func (t *defaultResultTracker) getResults() []interface{} { return t.results } @@ -65,6 +72,7 @@ type zoneAwareResultTracker struct { resultsPerZone map[string][]interface{} numInstances int zoneResultsQuorum bool + zoneCount int } func newZoneAwareResultTracker(instances []InstanceDesc, maxUnavailableZones int, zoneResultsQuorum bool) *zoneAwareResultTracker { @@ -81,6 +89,7 @@ func newZoneAwareResultTracker(instances []InstanceDesc, maxUnavailableZones int } t.minSuccessfulZones = len(t.waitingByZone) - maxUnavailableZones t.resultsPerZone = make(map[string][]interface{}, len(t.waitingByZone)) + t.zoneCount = len(t.waitingByZone) return t } @@ -119,6 +128,11 @@ func (t *zoneAwareResultTracker) failed() bool { return failedZones > t.maxUnavailableZones } +func (t *zoneAwareResultTracker) failedInAllZones() bool { + failedZones := len(t.failuresByZone) + return failedZones == t.zoneCount +} + func (t *zoneAwareResultTracker) getResults() []interface{} { results := make([]interface{}, 0, t.numInstances) if t.zoneResultsQuorum { diff --git a/pkg/ring/replication_set_tracker_test.go b/pkg/ring/replication_set_tracker_test.go index a0d04f1279e..1e22418ecbd 100644 --- a/pkg/ring/replication_set_tracker_test.go +++ b/pkg/ring/replication_set_tracker_test.go @@ -399,6 +399,29 @@ func TestZoneAwareResultTracker(t *testing.T) { assert.False(t, tracker.failed()) }, }, + "failInAllZones should return true only if all zones have failed, regardless of max unavailable zones": { + instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, + maxUnavailableZones: 1, + run: func(t *testing.T, tracker *zoneAwareResultTracker) { + // Zone-a + tracker.done(&instance1, nil, errors.New("test")) + assert.False(t, tracker.succeeded()) + assert.False(t, tracker.failed()) + assert.False(t, tracker.failedInAllZones()) + + // Zone-b + tracker.done(&instance3, nil, errors.New("test")) + assert.False(t, tracker.succeeded()) + assert.True(t, tracker.failed()) + assert.False(t, tracker.failedInAllZones()) + + // Zone-c + tracker.done(&instance5, nil, errors.New("test")) + assert.False(t, tracker.succeeded()) + assert.True(t, tracker.failed()) + assert.True(t, tracker.failedInAllZones()) + }, + }, } for testName, testCase := range tests { diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 7aceeeac22a..1c515ebd4cb 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -223,7 +223,7 @@ func testQueryableFunc(querierTestConfig *querier.TestConfig, reg prometheus.Reg querierTestConfig.Cfg.ActiveQueryTrackerDir = "" overrides, _ := validation.NewOverrides(querier.DefaultLimitsConfig(), nil) - q, _, _ := querier.New(querierTestConfig.Cfg, overrides, querierTestConfig.Distributor, querierTestConfig.Stores, reg, logger) + q, _, _ := querier.New(querierTestConfig.Cfg, overrides, querierTestConfig.Distributor, querierTestConfig.Stores, reg, logger, nil) return func(mint, maxt int64) (storage.Querier, error) { return 
q.Querier(mint, maxt) } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 7d2ab8518d3..729f62566e2 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -171,6 +171,7 @@ type Limits struct { MaxCacheFreshness model.Duration `yaml:"max_cache_freshness" json:"max_cache_freshness"` MaxQueriersPerTenant float64 `yaml:"max_queriers_per_tenant" json:"max_queriers_per_tenant"` QueryVerticalShardSize int `yaml:"query_vertical_shard_size" json:"query_vertical_shard_size" doc:"hidden"` + QueryPartialData bool `yaml:"query_partial_data" json:"query_partial_data" doc:"nocli|description=Enable to allow queries to be evaluated with data from a single zone, if other zones are not available.|default=false"` // Query Frontend / Scheduler enforced limits. MaxOutstandingPerTenant int `yaml:"max_outstanding_requests_per_tenant" json:"max_outstanding_requests_per_tenant"` @@ -186,6 +187,7 @@ type Limits struct { RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` + RulesPartialData bool `yaml:"rules_partial_data" json:"rules_partial_data" doc:"nocli|description=Enable to allow rules to be evaluated with data from a single zone, if other zones are not available.|default=false"` // Store-gateway. StoreGatewayTenantShardSize float64 `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -726,6 +728,11 @@ func (o *Overrides) QueryVerticalShardSize(userID string) int { return o.GetOverridesForUser(userID).QueryVerticalShardSize } +// QueryPartialData returns whether query may be evaluated with data from a single zone, if other zones are not available. +func (o *Overrides) QueryPartialData(userID string) bool { + return o.GetOverridesForUser(userID).QueryPartialData +} + // MaxQueryParallelism returns the limit to the number of split queries the // frontend will process in parallel. func (o *Overrides) MaxQueryParallelism(userID string) int { @@ -845,6 +852,11 @@ func (o *Overrides) RulerQueryOffset(userID string) time.Duration { return ruleOffset } +// RulesPartialData returns whether rule may be evaluated with data from a single zone, if other zones are not available. +func (o *Overrides) RulesPartialData(userID string) bool { + return o.GetOverridesForUser(userID).RulesPartialData +} + // StoreGatewayTenantShardSize returns the store-gateway shard size for a given user. 
func (o *Overrides) StoreGatewayTenantShardSize(userID string) float64 { return o.GetOverridesForUser(userID).StoreGatewayTenantShardSize diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 3ac0230d679..414cb3e8d45 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -638,6 +638,40 @@ tenant2: require.Equal(t, 5, ov.MaxDownloadedBytesPerRequest("tenant3")) } +func TestPartialDataOverridesPerTenant(t *testing.T) { + SetDefaultLimitsForYAMLUnmarshalling(Limits{}) + + baseYAML := ` +query_partial_data: false +rules_partial_data: false` + overridesYAML := ` +tenant1: + query_partial_data: true +tenant2: + query_partial_data: true + rules_partial_data: true` + + l := Limits{} + err := yaml.UnmarshalStrict([]byte(baseYAML), &l) + require.NoError(t, err) + + overrides := map[string]*Limits{} + err = yaml.Unmarshal([]byte(overridesYAML), &overrides) + require.NoError(t, err, "parsing overrides") + + tl := newMockTenantLimits(overrides) + + ov, err := NewOverrides(l, tl) + require.NoError(t, err) + + require.True(t, ov.QueryPartialData("tenant1")) + require.False(t, ov.RulesPartialData("tenant1")) + require.True(t, ov.QueryPartialData("tenant2")) + require.True(t, ov.RulesPartialData("tenant2")) + require.False(t, ov.QueryPartialData("tenant3")) + require.False(t, ov.RulesPartialData("tenant3")) +} + func TestHasQueryAttributeRegexChanged(t *testing.T) { l := Limits{ QueryPriority: QueryPriority{ From 1c7157c63ecde3d398143e9aff79b6ca75e951e4 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Tue, 28 Jan 2025 15:30:41 -0800 Subject: [PATCH 27/34] Add timeout for dynamodb ring kv (#6544) * add dynamodb kv with timeout enforced Signed-off-by: yeya24 * add tests Signed-off-by: yeya24 * docs Signed-off-by: Ben Ye * update changelog Signed-off-by: Ben Ye --------- Signed-off-by: yeya24 Signed-off-by: Ben Ye Signed-off-by: Alex Le --- docs/blocks-storage/compactor.md | 4 ++ docs/blocks-storage/store-gateway.md | 4 ++ docs/configuration/config-file-reference.md | 28 ++++++++ pkg/ring/kv/dynamodb/client.go | 9 ++- pkg/ring/kv/dynamodb/client_test.go | 77 +++++++++++++++++++++ pkg/ring/kv/dynamodb/dynamodb.go | 39 +++++++++++ 6 files changed, 160 insertions(+), 1 deletion(-) diff --git a/docs/blocks-storage/compactor.md b/docs/blocks-storage/compactor.md index 1cfc53ec5c6..15f8fe7383a 100644 --- a/docs/blocks-storage/compactor.md +++ b/docs/blocks-storage/compactor.md @@ -225,6 +225,10 @@ compactor: # CLI flag: -compactor.ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -compactor.ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: compactor.ring [consul: ] diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md index fbd3c92af2c..31005f0eaec 100644 --- a/docs/blocks-storage/store-gateway.md +++ b/docs/blocks-storage/store-gateway.md @@ -240,6 +240,10 @@ store_gateway: # CLI flag: -store-gateway.sharding-ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -store-gateway.sharding-ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. 
# The CLI flags prefix for this block config is: # store-gateway.sharding-ring diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 43948773a55..e850a511b5b 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -341,6 +341,10 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -alertmanager.sharding-ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: alertmanager.sharding-ring [consul: ] @@ -2286,6 +2290,10 @@ sharding_ring: # CLI flag: -compactor.ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -compactor.ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: compactor.ring [consul: ] @@ -2595,6 +2603,10 @@ ha_tracker: # CLI flag: -distributor.ha-tracker.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -distributor.ha-tracker.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: distributor.ha-tracker [consul: ] @@ -2689,6 +2701,10 @@ ring: # CLI flag: -distributor.ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -distributor.ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: distributor.ring [consul: ] @@ -3017,6 +3033,10 @@ lifecycler: # CLI flag: -dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. [consul: ] @@ -4674,6 +4694,10 @@ ring: # CLI flag: -ruler.ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -ruler.ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. # The CLI flags prefix for this block config is: ruler.ring [consul: ] @@ -5665,6 +5689,10 @@ sharding_ring: # CLI flag: -store-gateway.sharding-ring.dynamodb.max-cas-retries [max_cas_retries: | default = 10] + # Timeout of dynamoDbClient requests. Default is 2m. + # CLI flag: -store-gateway.sharding-ring.dynamodb.timeout + [timeout: | default = 2m] + # The consul_config configures the consul client. 
# The CLI flags prefix for this block config is: store-gateway.sharding-ring [consul: ] diff --git a/pkg/ring/kv/dynamodb/client.go b/pkg/ring/kv/dynamodb/client.go index 71de47f0e5c..0fb53294d1d 100644 --- a/pkg/ring/kv/dynamodb/client.go +++ b/pkg/ring/kv/dynamodb/client.go @@ -26,6 +26,7 @@ type Config struct { TTL time.Duration `yaml:"ttl"` PullerSyncTime time.Duration `yaml:"puller_sync_time"` MaxCasRetries int `yaml:"max_cas_retries"` + Timeout time.Duration `yaml:"timeout"` } type Client struct { @@ -53,6 +54,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, prefix string) { f.DurationVar(&cfg.TTL, prefix+"dynamodb.ttl-time", 0, "Time to expire items on dynamodb.") f.DurationVar(&cfg.PullerSyncTime, prefix+"dynamodb.puller-sync-time", 60*time.Second, "Time to refresh local ring with information on dynamodb.") f.IntVar(&cfg.MaxCasRetries, prefix+"dynamodb.max-cas-retries", maxCasRetries, "Maximum number of retries for DDB KV CAS.") + f.DurationVar(&cfg.Timeout, prefix+"dynamodb.timeout", 2*time.Minute, "Timeout of dynamoDbClient requests. Default is 2m.") } func NewClient(cfg Config, cc codec.Codec, logger log.Logger, registerer prometheus.Registerer) (*Client, error) { @@ -69,8 +71,13 @@ func NewClient(cfg Config, cc codec.Codec, logger log.Logger, registerer prometh MaxRetries: cfg.MaxCasRetries, } + var kv dynamoDbClient + kv = dynamodbInstrumentation{kv: dynamoDB, ddbMetrics: ddbMetrics} + if cfg.Timeout > 0 { + kv = newDynamodbKVWithTimeout(kv, cfg.Timeout) + } c := &Client{ - kv: dynamodbInstrumentation{kv: dynamoDB, ddbMetrics: ddbMetrics}, + kv: kv, codec: cc, logger: ddbLog(logger), ddbMetrics: ddbMetrics, diff --git a/pkg/ring/kv/dynamodb/client_test.go b/pkg/ring/kv/dynamodb/client_test.go index 666e2cf3112..7cefe64f752 100644 --- a/pkg/ring/kv/dynamodb/client_test.go +++ b/pkg/ring/kv/dynamodb/client_test.go @@ -302,6 +302,29 @@ func Test_UpdateStaleData(t *testing.T) { } +func Test_DynamodbKVWithTimeout(t *testing.T) { + ddbMock := NewDynamodbClientMock() + // Backend has delay of 5s while the client timeout is 1s. + ddbWithDelay := newDynamodbKVWithDelay(ddbMock, time.Second*5) + dbWithTimeout := newDynamodbKVWithTimeout(ddbWithDelay, time.Second) + + ctx := context.Background() + _, _, err := dbWithTimeout.List(ctx, dynamodbKey{primaryKey: key}) + require.True(t, errors.Is(err, context.DeadlineExceeded)) + + err = dbWithTimeout.Delete(ctx, dynamodbKey{primaryKey: key}) + require.True(t, errors.Is(err, context.DeadlineExceeded)) + + _, _, err = dbWithTimeout.Query(ctx, dynamodbKey{primaryKey: key}, true) + require.True(t, errors.Is(err, context.DeadlineExceeded)) + + err = dbWithTimeout.Put(ctx, dynamodbKey{primaryKey: key}, []byte{}) + require.True(t, errors.Is(err, context.DeadlineExceeded)) + + err = dbWithTimeout.Batch(ctx, nil, nil) + require.True(t, errors.Is(err, context.DeadlineExceeded)) +} + // NewClientMock makes a new local dynamodb client. 
func NewClientMock(ddbClient dynamoDbClient, cc codec.Codec, logger log.Logger, registerer prometheus.Registerer, time time.Duration, config backoff.Config) *Client { return &Client{ @@ -429,3 +452,57 @@ func (m *DescMock) FindDifference(that codec.MultiKey) (interface{}, []string, e } return args.Get(0), args.Get(1).([]string), err } + +type dynamodbKVWithDelayAndContextCheck struct { + ddbClient dynamoDbClient + delay time.Duration +} + +func newDynamodbKVWithDelay(client dynamoDbClient, delay time.Duration) *dynamodbKVWithDelayAndContextCheck { + return &dynamodbKVWithDelayAndContextCheck{ddbClient: client, delay: delay} +} + +func (d *dynamodbKVWithDelayAndContextCheck) List(ctx context.Context, key dynamodbKey) ([]string, float64, error) { + select { + case <-ctx.Done(): + return nil, 0, ctx.Err() + case <-time.After(d.delay): + return d.ddbClient.List(ctx, key) + } +} + +func (d *dynamodbKVWithDelayAndContextCheck) Query(ctx context.Context, key dynamodbKey, isPrefix bool) (map[string][]byte, float64, error) { + select { + case <-ctx.Done(): + return nil, 0, ctx.Err() + case <-time.After(d.delay): + return d.ddbClient.Query(ctx, key, isPrefix) + } +} + +func (d *dynamodbKVWithDelayAndContextCheck) Delete(ctx context.Context, key dynamodbKey) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(d.delay): + return d.ddbClient.Delete(ctx, key) + } +} + +func (d *dynamodbKVWithDelayAndContextCheck) Put(ctx context.Context, key dynamodbKey, data []byte) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(d.delay): + return d.ddbClient.Put(ctx, key, data) + } +} + +func (d *dynamodbKVWithDelayAndContextCheck) Batch(ctx context.Context, put map[dynamodbKey][]byte, delete []dynamodbKey) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(d.delay): + return d.ddbClient.Batch(ctx, put, delete) + } +} diff --git a/pkg/ring/kv/dynamodb/dynamodb.go b/pkg/ring/kv/dynamodb/dynamodb.go index f54e5fe55be..2dc4769d6e2 100644 --- a/pkg/ring/kv/dynamodb/dynamodb.go +++ b/pkg/ring/kv/dynamodb/dynamodb.go @@ -259,6 +259,45 @@ func (kv dynamodbKV) generatePutItemRequest(key dynamodbKey, data []byte) map[st return item } +type dynamodbKVWithTimeout struct { + ddbClient dynamoDbClient + timeout time.Duration +} + +func newDynamodbKVWithTimeout(client dynamoDbClient, timeout time.Duration) *dynamodbKVWithTimeout { + return &dynamodbKVWithTimeout{ddbClient: client, timeout: timeout} +} + +func (d *dynamodbKVWithTimeout) List(ctx context.Context, key dynamodbKey) ([]string, float64, error) { + ctx, cancel := context.WithTimeout(ctx, d.timeout) + defer cancel() + return d.ddbClient.List(ctx, key) +} + +func (d *dynamodbKVWithTimeout) Query(ctx context.Context, key dynamodbKey, isPrefix bool) (map[string][]byte, float64, error) { + ctx, cancel := context.WithTimeout(ctx, d.timeout) + defer cancel() + return d.ddbClient.Query(ctx, key, isPrefix) +} + +func (d *dynamodbKVWithTimeout) Delete(ctx context.Context, key dynamodbKey) error { + ctx, cancel := context.WithTimeout(ctx, d.timeout) + defer cancel() + return d.ddbClient.Delete(ctx, key) +} + +func (d *dynamodbKVWithTimeout) Put(ctx context.Context, key dynamodbKey, data []byte) error { + ctx, cancel := context.WithTimeout(ctx, d.timeout) + defer cancel() + return d.ddbClient.Put(ctx, key, data) +} + +func (d *dynamodbKVWithTimeout) Batch(ctx context.Context, put map[dynamodbKey][]byte, delete []dynamodbKey) error { + ctx, cancel := context.WithTimeout(ctx, d.timeout) + defer 
cancel() + return d.ddbClient.Batch(ctx, put, delete) +} + func generateItemKey(key dynamodbKey) map[string]*dynamodb.AttributeValue { resp := map[string]*dynamodb.AttributeValue{ primaryKey: { From 4ec7588755c9f23dd1dd5a9e03773c5b8e3bd53c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 29 Jan 2025 09:20:39 -0800 Subject: [PATCH 28/34] Bump the actions-dependencies group across 1 directory with 2 updates (#6564) Bumps the actions-dependencies group with 2 updates in the / directory: [github/codeql-action](https://github.com/github/codeql-action) and [actions/setup-go](https://github.com/actions/setup-go). Updates `github/codeql-action` from 3.28.1 to 3.28.7 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/b6a472f63d85b9c78a3ac5e89422239fc15e9b3c...6e5455904168f98c75d8e5ad848b4dc4ab3ae77e) Updates `actions/setup-go` from 5.2.0 to 5.3.0 - [Release notes](https://github.com/actions/setup-go/releases) - [Commits](https://github.com/actions/setup-go/compare/3041bf56c941b39c61721a86cd11f3bb1338122a...f111f3307d8850f501ac008e886eec1fd1932a34) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch dependency-group: actions-dependencies - dependency-name: actions/setup-go dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions-dependencies ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Alex Le --- .github/workflows/scorecards.yml | 2 +- .github/workflows/test-build-deploy.yml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 0b635368489..d61ee925e8f 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/upload-sarif@6e5455904168f98c75d8e5ad848b4dc4ab3ae77e # v3.28.7 with: sarif_file: results.sarif diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 88aa222aad7..b3e1afe68fe 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -93,15 +93,15 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/init@6e5455904168f98c75d8e5ad848b4dc4ab3ae77e # v3.28.7 with: languages: go - name: Autobuild - uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/autobuild@6e5455904168f98c75d8e5ad848b4dc4ab3ae77e # v3.28.7 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/analyze@6e5455904168f98c75d8e5ad848b4dc4ab3ae77e # v3.28.7 build: @@ -164,7 +164,7 @@ jobs: - integration_query_fuzz steps: - name: Upgrade golang - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 + uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0 with: go-version: 1.23.2 - name: Checkout Repo From d97f61095c6831c1e71662c423a41360ae6ce543 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Wed, 29 Jan 2025 09:23:50 -0800 Subject: [PATCH 29/34] Fix: expanded postings can cache wrong data when queries are issued "in the future" (#6562) * improve fuzz test for expanded postings cache Signed-off-by: alanprot * create more tests on the expanded postings cache Signed-off-by: alanprot * adding get series call on the test Signed-off-by: alanprot * no use CachedBlockChunkQuerier when query time range is completely after the last sample added in the head Signed-off-by: alanprot * adding comments Signed-off-by: alanprot * increase the number of fuzz test from 100 to 300 Signed-off-by: alanprot * add get series fuzzy testing Signed-off-by: alanprot --------- Signed-off-by: alanprot Signed-off-by: Alex Le --- integration/query_fuzz_test.go | 67 +++++-- pkg/ingester/ingester.go | 28 ++- pkg/ingester/ingester_test.go | 195 ++++++++++++++++++++ pkg/storage/tsdb/expanded_postings_cache.go | 14 ++ 4 files changed, 283 insertions(+), 21 deletions(-) diff --git a/integration/query_fuzz_test.go b/integration/query_fuzz_test.go index cb2e8514254..c7d28e9eb64 100644 --- a/integration/query_fuzz_test.go +++ b/integration/query_fuzz_test.go @@ -433,7 +433,7 @@ func TestExpandedPostingsCacheFuzz(t *testing.T) { scrapeInterval, i*numSamples, numSamples, - prompb.Label{Name: "j", Value: fmt.Sprintf("%d", j)}, + prompb.Label{Name: "test_label", Value: fmt.Sprintf("test_label_value_%d", j)}, ) ss[i*numberOfLabelsPerSeries+j] = series @@ -453,11 +453,18 @@ func TestExpandedPostingsCacheFuzz(t *testing.T) { ps := promqlsmith.New(rnd, lbls, opts...) 
// Create the queries with the original labels - testRun := 100 + testRun := 300 queries := make([]string, testRun) + matchers := make([]string, testRun) for i := 0; i < testRun; i++ { expr := ps.WalkRangeQuery() queries[i] = expr.Pretty(0) + matchers[i] = storepb.PromMatchersToString( + append( + ps.WalkSelectors(), + labels.MustNewMatcher(labels.MatchEqual, "__name__", fmt.Sprintf("test_series_%d", i%numSeries)), + )..., + ) } // Lets run multiples iterations and create new series every iteration @@ -472,7 +479,7 @@ func TestExpandedPostingsCacheFuzz(t *testing.T) { scrapeInterval, i*numSamples, numSamples, - prompb.Label{Name: "j", Value: fmt.Sprintf("%d", j)}, + prompb.Label{Name: "test_label", Value: fmt.Sprintf("test_label_value_%d", j)}, prompb.Label{Name: "k", Value: fmt.Sprintf("%d", k)}, ) } @@ -485,20 +492,33 @@ func TestExpandedPostingsCacheFuzz(t *testing.T) { } type testCase struct { - query string - res1, res2 model.Value - err1, err2 error + query string + qt string + res1, res2 model.Value + sres1, sres2 []model.LabelSet + err1, err2 error } - queryStart := time.Now().Add(-time.Hour * 24) - queryEnd := time.Now() - cases := make([]*testCase, 0, 200) + cases := make([]*testCase, 0, len(queries)*3) for _, query := range queries { - res1, err1 := c1.QueryRange(query, queryStart, queryEnd, scrapeInterval) - res2, err2 := c2.QueryRange(query, queryStart, queryEnd, scrapeInterval) + fuzzyTime := time.Duration(rand.Int63n(time.Now().UnixMilli() - start.UnixMilli())) + queryEnd := start.Add(fuzzyTime * time.Millisecond) + res1, err1 := c1.Query(query, queryEnd) + res2, err2 := c2.Query(query, queryEnd) + cases = append(cases, &testCase{ + query: query, + qt: "instant", + res1: res1, + res2: res2, + err1: err1, + err2: err2, + }) + res1, err1 = c1.QueryRange(query, start, queryEnd, scrapeInterval) + res2, err2 = c2.QueryRange(query, start, queryEnd, scrapeInterval) cases = append(cases, &testCase{ query: query, + qt: "range query", res1: res1, res2: res2, err1: err1, @@ -506,21 +526,38 @@ func TestExpandedPostingsCacheFuzz(t *testing.T) { }) } + for _, m := range matchers { + fuzzyTime := time.Duration(rand.Int63n(time.Now().UnixMilli() - start.UnixMilli())) + queryEnd := start.Add(fuzzyTime * time.Millisecond) + res1, err := c1.Series([]string{m}, start, queryEnd) + require.NoError(t, err) + res2, err := c2.Series([]string{m}, start, queryEnd) + require.NoError(t, err) + cases = append(cases, &testCase{ + query: m, + qt: "get series", + sres1: res1, + sres2: res2, + }) + } + failures := 0 for i, tc := range cases { - qt := "range query" if tc.err1 != nil || tc.err2 != nil { if !cmp.Equal(tc.err1, tc.err2) { - t.Logf("case %d error mismatch.\n%s: %s\nerr1: %v\nerr2: %v\n", i, qt, tc.query, tc.err1, tc.err2) + t.Logf("case %d error mismatch.\n%s: %s\nerr1: %v\nerr2: %v\n", i, tc.qt, tc.query, tc.err1, tc.err2) failures++ } } else if shouldUseSampleNumComparer(tc.query) { if !cmp.Equal(tc.res1, tc.res2, sampleNumComparer) { - t.Logf("case %d # of samples mismatch.\n%s: %s\nres1: %s\nres2: %s\n", i, qt, tc.query, tc.res1.String(), tc.res2.String()) + t.Logf("case %d # of samples mismatch.\n%s: %s\nres1: %s\nres2: %s\n", i, tc.qt, tc.query, tc.res1.String(), tc.res2.String()) failures++ } } else if !cmp.Equal(tc.res1, tc.res2, comparer) { - t.Logf("case %d results mismatch.\n%s: %s\nres1: %s\nres2: %s\n", i, qt, tc.query, tc.res1.String(), tc.res2.String()) + t.Logf("case %d results mismatch.\n%s: %s\nres1: %s\nres2: %s\n", i, tc.qt, tc.query, tc.res1.String(), tc.res2.String()) + 
failures++ + } else if !cmp.Equal(tc.sres1, tc.sres1, labelSetsComparer) { + t.Logf("case %d results mismatch.\n%s: %s\nsres1: %s\nsres2: %s\n", i, tc.qt, tc.query, tc.sres1, tc.sres2) failures++ } } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index bf921243af4..4c41a90a991 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -2283,6 +2283,27 @@ func (i *Ingester) getOrCreateTSDB(userID string, force bool) (*userTSDB, error) return db, nil } +func (i *Ingester) blockChunkQuerierFunc(userId string) tsdb.BlockChunkQuerierFunc { + return func(b tsdb.BlockReader, mint, maxt int64) (storage.ChunkQuerier, error) { + db := i.getTSDB(userId) + + var postingCache cortex_tsdb.ExpandedPostingsCache + if db != nil { + postingCache = db.postingCache + } + + // Caching expanded postings for queries that are "in the future" may lead to incorrect results being cached. + // This occurs because the tsdb.PostingsForMatchers function can return invalid data in such scenarios. + // For more details, see: https://github.com/cortexproject/cortex/issues/6556 + // TODO: alanprot: Consider removing this logic when prometheus is updated as this logic is "fixed" upstream. + if postingCache == nil || mint > db.Head().MaxTime() { + return tsdb.NewBlockChunkQuerier(b, mint, maxt) + } + + return cortex_tsdb.NewCachedBlockChunkQuerier(postingCache, b, mint, maxt) + } +} + // createTSDB creates a TSDB for a given userID, and returns the created db. func (i *Ingester) createTSDB(userID string) (*userTSDB, error) { tsdbPromReg := prometheus.NewRegistry() @@ -2346,12 +2367,7 @@ func (i *Ingester) createTSDB(userID string) (*userTSDB, error) { OutOfOrderCapMax: i.cfg.BlocksStorageConfig.TSDB.OutOfOrderCapMax, EnableOverlappingCompaction: false, // Always let compactors handle overlapped blocks, e.g. OOO blocks. 
EnableNativeHistograms: i.cfg.BlocksStorageConfig.TSDB.EnableNativeHistograms, - BlockChunkQuerierFunc: func(b tsdb.BlockReader, mint, maxt int64) (storage.ChunkQuerier, error) { - if postingCache != nil { - return cortex_tsdb.NewCachedBlockChunkQuerier(postingCache, b, mint, maxt) - } - return tsdb.NewBlockChunkQuerier(b, mint, maxt) - }, + BlockChunkQuerierFunc: i.blockChunkQuerierFunc(userID), }, nil) if err != nil { return nil, errors.Wrapf(err, "failed to open TSDB: %s", udir) diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 3d386947fa5..f3da262fe6a 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -5605,6 +5605,201 @@ func TestExpendedPostingsCacheIsolation(t *testing.T) { wg.Wait() } +func TestExpendedPostingsCacheMatchers(t *testing.T) { + cfg := defaultIngesterTestConfig(t) + cfg.BlocksStorageConfig.TSDB.ExpandedCachingExpireInterval = time.Second + cfg.BlocksStorageConfig.TSDB.BlockRanges = []time.Duration{2 * time.Hour} + cfg.BlocksStorageConfig.TSDB.PostingsCache.Blocks.Enabled = true + cfg.BlocksStorageConfig.TSDB.PostingsCache.Head.Enabled = true + cfg.QueryIngestersWithin = 24 * time.Hour + + ctx := user.InjectOrgID(context.Background(), userID) + + r := prometheus.NewRegistry() + ing, err := prepareIngesterWithBlocksStorage(t, cfg, r) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing)) + defer services.StopAndAwaitTerminated(context.Background(), ing) //nolint:errcheck + + // Wait until the ingester is ACTIVE + test.Poll(t, 100*time.Millisecond, ring.ACTIVE, func() interface{} { + return ing.lifecycler.GetState() + }) + + numberOfMetricNames := 10 + seriesPerMetricsNames := 25 + timeStamp := int64(60 * 1000) + seriesCreated := map[string]labels.Labels{} + + for i := 0; i < numberOfMetricNames; i++ { + metricName := fmt.Sprintf("metric_%v", i) + for j := 0; j < seriesPerMetricsNames; j++ { + s := labels.FromStrings(labels.MetricName, metricName, "labelA", fmt.Sprintf("series_%v", j)) + _, err = ing.Push(ctx, cortexpb.ToWriteRequest([]labels.Labels{s}, []cortexpb.Sample{{Value: 2, TimestampMs: timeStamp}}, nil, nil, cortexpb.API)) + seriesCreated[s.String()] = s + require.NoError(t, err) + } + } + + db := ing.getTSDB(userID) + + type testCase struct { + matchers []*client.LabelMatcher + } + + cases := []testCase{} + + nameMatcher := &client.LabelMatcher{ + Type: client.EQUAL, + Name: labels.MetricName, + Value: "metric_0", + } + + for i := 0; i < 4; i++ { + tc := testCase{ + matchers: []*client.LabelMatcher{nameMatcher}, + } + + switch client.MatchType(i) { + case client.EQUAL | client.NOT_EQUAL: + tc.matchers = append(tc.matchers, &client.LabelMatcher{ + Type: client.MatchType(i), + Name: "labelA", + Value: "series_0", + }) + default: + tc.matchers = append(tc.matchers, &client.LabelMatcher{ + Type: client.MatchType(i), + Name: "labelA", + Value: "series_.*", + }) + } + cases = append(cases, tc) + } + + for _, v := range []string{".*", "", ".+"} { + cases = append(cases, + testCase{ + matchers: []*client.LabelMatcher{ + nameMatcher, + { + Type: client.REGEX_MATCH, + Name: "labelA", + Value: v, + }, + }, + }, + testCase{ + matchers: []*client.LabelMatcher{ + nameMatcher, + { + Type: client.REGEX_NO_MATCH, + Name: "labelA", + Value: v, + }, + }, + }, + ) + } + + ranges := []struct { + startTs, endTs int64 + hasSamples bool + }{ + // Totally in the past + { + startTs: 0, + endTs: timeStamp / 2, + hasSamples: false, + }, + { + startTs: timeStamp / 2, + endTs: 
timeStamp, + hasSamples: true, + }, + { + startTs: timeStamp / 2, + endTs: timeStamp * 2, + hasSamples: true, + }, + { + startTs: timeStamp + 1, + endTs: timeStamp * 2, + hasSamples: false, + }, + } + + verify := func(t *testing.T, tc testCase, startTs, endTs int64, hasSamples bool) { + + expectedCount := len(seriesCreated) + matchers, err := client.FromLabelMatchers(ing.matchersCache, tc.matchers) + require.NoError(t, err) + for _, s := range seriesCreated { + for _, m := range matchers { + if !m.Matches(s.Get(m.Name)) { + expectedCount-- + break + } + } + } + + seriesResponse, err := ing.MetricsForLabelMatchers(ctx, &client.MetricsForLabelMatchersRequest{ + StartTimestampMs: startTs, + EndTimestampMs: endTs, + MatchersSet: []*client.LabelMatchers{ + { + Matchers: tc.matchers, + }, + }, + }) + require.NoError(t, err) + if hasSamples { + require.Len(t, seriesResponse.Metric, expectedCount) + } else { + require.Len(t, seriesResponse.Metric, 0) + } + + s := &mockQueryStreamServer{ctx: ctx} + err = ing.QueryStream(&client.QueryRequest{ + StartTimestampMs: startTs, + EndTimestampMs: endTs, + Matchers: tc.matchers, + }, s) + require.NoError(t, err) + if hasSamples { + require.Equal(t, expectedCount, len(s.series)) + } else { + require.Equal(t, 0, len(s.series)) + } + } + + for _, tc := range cases { + testName := "" + for _, matcher := range tc.matchers { + t, _ := matcher.MatcherType() + testName += matcher.Name + t.String() + matcher.Value + "|" + + } + t.Run(fmt.Sprintf("%v", testName), func(t *testing.T) { + for _, r := range ranges { + t.Run(fmt.Sprintf("start=%v,end=%v", r.startTs, r.endTs), func(t *testing.T) { + db.postingCache.Clear() + + // lets run 2 times to hit the cache + for i := 0; i < 2; i++ { + verify(t, tc, r.startTs, r.endTs, r.hasSamples) + } + + // run the test again with all other ranges + for _, r1 := range ranges { + verify(t, tc, r1.startTs, r1.endTs, r1.hasSamples) + } + }) + } + }) + } +} + func TestExpendedPostingsCache(t *testing.T) { cfg := defaultIngesterTestConfig(t) cfg.BlocksStorageConfig.TSDB.ExpandedCachingExpireInterval = time.Second diff --git a/pkg/storage/tsdb/expanded_postings_cache.go b/pkg/storage/tsdb/expanded_postings_cache.go index a24087e824f..a436350b610 100644 --- a/pkg/storage/tsdb/expanded_postings_cache.go +++ b/pkg/storage/tsdb/expanded_postings_cache.go @@ -125,6 +125,7 @@ type ExpandedPostingsCache interface { PostingsForMatchers(ctx context.Context, blockID ulid.ULID, ix tsdb.IndexReader, ms ...*labels.Matcher) (index.Postings, error) ExpireSeries(metric labels.Labels) PurgeExpiredItems() + Clear() Size() int } @@ -140,6 +141,11 @@ type blocksPostingsForMatchersCache struct { seedByHash *seedByHash } +func (c *blocksPostingsForMatchersCache) Clear() { + c.headCache.clear() + c.blocksCache.clear() +} + func newBlocksPostingsForMatchersCache(userId string, cfg TSDBPostingsCacheConfig, metrics *ExpandedPostingsCacheMetrics, seedByHash *seedByHash) ExpandedPostingsCache { if cfg.PostingsForMatchers == nil { cfg.PostingsForMatchers = tsdb.PostingsForMatchers @@ -358,6 +364,14 @@ func newFifoCache[V any](cfg PostingsCacheConfig, name string, metrics *Expanded } } +func (c *fifoCache[V]) clear() { + c.cachedMtx.Lock() + defer c.cachedMtx.Unlock() + c.cached = list.New() + c.cachedBytes = 0 + c.cachedValues = new(sync.Map) +} + func (c *fifoCache[V]) expire() { if c.cfg.Ttl <= 0 { return From 0350068d7f9586a47fbd81d136134eacdbbb9ec2 Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Thu, 30 Jan 2025 14:25:53 -0800 Subject: [PATCH 30/34] Extend 
ShuffleSharding on READONLY ingesters (#6517) * Filter readOnly ingesters when sharding Signed-off-by: Daniel Deluiggi * Extend shard on READONLY Signed-off-by: Daniel Deluiggi * Remove old code Signed-off-by: Daniel Deluiggi * Fix test Signed-off-by: Daniel Deluiggi * update changelog Signed-off-by: Daniel Deluiggi --------- Signed-off-by: Daniel Deluiggi Signed-off-by: Alex Le --- CHANGELOG.md | 1 + pkg/ingester/ingester.go | 2 +- pkg/ring/lifecycler.go | 6 +++ pkg/ring/lifecycler_test.go | 80 +++++++++++++++++++++++++++ pkg/ring/model.go | 19 +++++-- pkg/ring/model_test.go | 20 +++++++ pkg/ring/ring.go | 10 +++- pkg/ring/ring_test.go | 105 ++++++++++++++++++++++++++++++++++++ 8 files changed, 235 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 771bb0554a5..7744fdb6387 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 +* [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring #6517 ## 1.19.0 in progress diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 4c41a90a991..c15f82ce354 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -3122,7 +3122,7 @@ func (i *Ingester) flushHandler(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } -// ModeHandler Change mode of ingester. It will also update set unregisterOnShutdown to true if READONLY mode +// ModeHandler Change mode of ingester. func (i *Ingester) ModeHandler(w http.ResponseWriter, r *http.Request) { err := r.ParseForm() if err != nil { diff --git a/pkg/ring/lifecycler.go b/pkg/ring/lifecycler.go index 7e8b033e83f..c34d3464019 100644 --- a/pkg/ring/lifecycler.go +++ b/pkg/ring/lifecycler.go @@ -1005,6 +1005,12 @@ func (i *Lifecycler) changeState(ctx context.Context, state InstanceState) error level.Info(i.logger).Log("msg", "changing instance state from", "old_state", currState, "new_state", state, "ring", i.RingName) i.setState(state) + + //The instances is rejoining the ring. It should reset its registered time. + if currState == READONLY && state == ACTIVE { + registeredAt := time.Now() + i.setRegisteredAt(registeredAt) + } return i.updateConsul(ctx) } diff --git a/pkg/ring/lifecycler_test.go b/pkg/ring/lifecycler_test.go index 035cfc8f1b8..e0756765379 100644 --- a/pkg/ring/lifecycler_test.go +++ b/pkg/ring/lifecycler_test.go @@ -827,6 +827,86 @@ func TestTokenFileOnDisk(t *testing.T) { } } +func TestRegisteredAtOnBackToActive(t *testing.T) { + ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + var ringConfig Config + flagext.DefaultValues(&ringConfig) + ringConfig.KVStore.Mock = ringStore + + r, err := New(ringConfig, "ingester", ringKey, log.NewNopLogger(), nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) + defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck + + tokenDir := t.TempDir() + + lifecyclerConfig := testLifecyclerConfig(ringConfig, "ing1") + lifecyclerConfig.NumTokens = 512 + lifecyclerConfig.TokensFilePath = tokenDir + "/tokens" + + // Start first ingester. 
+ l1, err := NewLifecycler(lifecyclerConfig, &noopFlushTransferer{}, "ingester", ringKey, true, true, log.NewNopLogger(), nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) + + // Check this ingester joined, is active. + test.Poll(t, 1000*time.Millisecond, true, func() interface{} { + d, err := r.KVClient.Get(context.Background(), ringKey) + require.NoError(t, err) + + desc, ok := d.(*Desc) + return ok && + len(desc.Ingesters) == 1 && + desc.Ingesters["ing1"].State == ACTIVE + }) + + //Get original registeredTime + d, err := r.KVClient.Get(context.Background(), ringKey) + require.NoError(t, err) + desc, ok := d.(*Desc) + require.True(t, ok) + originalRegisterTime := desc.Ingesters["ing1"].RegisteredTimestamp + + // Change state from ACTIVE to READONLY + err = l1.ChangeState(context.Background(), READONLY) + require.NoError(t, err) + test.Poll(t, 1000*time.Millisecond, true, func() interface{} { + d, err := r.KVClient.Get(context.Background(), ringKey) + require.NoError(t, err) + + desc, ok := d.(*Desc) + return ok && + desc.Ingesters["ing1"].State == READONLY + }) + + //Guarantee 1s diff for RegisteredTimestamp + time.Sleep(1 * time.Second) + + // Change state from READONLY to ACTIVE + err = l1.ChangeState(context.Background(), ACTIVE) + require.NoError(t, err) + test.Poll(t, 1000*time.Millisecond, true, func() interface{} { + d, err := r.KVClient.Get(context.Background(), ringKey) + require.NoError(t, err) + + desc, ok := d.(*Desc) + return ok && + desc.Ingesters["ing1"].State == ACTIVE + }) + + d, err = r.KVClient.Get(context.Background(), ringKey) + require.NoError(t, err) + + desc, ok = d.(*Desc) + require.True(t, ok) + ing := desc.Ingesters["ing1"] + require.True(t, ing.RegisteredTimestamp > originalRegisterTime) + + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l1)) +} + func TestTokenFileOnDisk_WithoutAutoJoinOnStartup(t *testing.T) { ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) t.Cleanup(func() { assert.NoError(t, closer.Close()) }) diff --git a/pkg/ring/model.go b/pkg/ring/model.go index 3f0e6944e2f..a465bf0fa91 100644 --- a/pkg/ring/model.go +++ b/pkg/ring/model.go @@ -543,10 +543,11 @@ type CompareResult int const ( Equal CompareResult = iota // Both rings contain same exact instances. EqualButStatesAndTimestamps // Both rings contain the same instances with the same data except states and timestamps (may differ). + EqualButReadOnly // Both rings contain the same instances but Write ring can change due to ReadOnly update Different // Rings have different set of instances, or their information don't match. ) -// RingCompare compares this ring against another one and returns one of Equal, EqualButStatesAndTimestamps or Different. +// RingCompare compares this ring against another one and returns one of Equal, EqualButStatesAndTimestamps, EqualButReadOnly or Different. 
func (d *Desc) RingCompare(o *Desc) CompareResult { if d == nil { if o == nil || len(o.Ingesters) == 0 { @@ -566,6 +567,7 @@ func (d *Desc) RingCompare(o *Desc) CompareResult { } equalStatesAndTimestamps := true + equalReadOnly := true for name, ing := range d.Ingesters { oing, ok := o.Ingesters[name] @@ -600,14 +602,21 @@ func (d *Desc) RingCompare(o *Desc) CompareResult { } if ing.State != oing.State { - equalStatesAndTimestamps = false + if ing.State == READONLY || oing.State == READONLY { + equalReadOnly = false + } else { + equalStatesAndTimestamps = false + } } } - if equalStatesAndTimestamps { - return Equal + if !equalReadOnly { + return EqualButReadOnly + } + if !equalStatesAndTimestamps { + return EqualButStatesAndTimestamps } - return EqualButStatesAndTimestamps + return Equal } func GetOrCreateRingDesc(d interface{}) *Desc { diff --git a/pkg/ring/model_test.go b/pkg/ring/model_test.go index 896aef56897..f34b6e566d2 100644 --- a/pkg/ring/model_test.go +++ b/pkg/ring/model_test.go @@ -395,6 +395,21 @@ func TestDesc_RingsCompare(t *testing.T) { r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, expected: Equal, }, + "same number of instances, from active to readOnly": { + r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE}}}, + r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: READONLY}}}, + expected: EqualButReadOnly, + }, + "same number of instances, from readOnly to active": { + r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: READONLY}}}, + r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE}}}, + expected: EqualButReadOnly, + }, + "same number of instances, prioritize readOnly than timestamp changes": { + r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE, Timestamp: 123456}}}, + r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: READONLY, Timestamp: 789012}}}, + expected: EqualButReadOnly, + }, "same single instance, different timestamp": { r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Timestamp: 123456}}}, r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Timestamp: 789012}}}, @@ -440,6 +455,11 @@ func TestDesc_RingsCompare(t *testing.T) { r2: &Desc{Ingesters: map[string]InstanceDesc{"ing2": {Addr: "addr1", Tokens: []uint32{1, 2, 3}}}}, expected: Different, }, + "same number of instances, prioritize diff than ReadOnly": { + r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Zone: "one", State: ACTIVE}}}, + r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Zone: "two", State: READONLY}}}, + expected: Different, + }, } for testName, testData := range tests { diff --git a/pkg/ring/ring.go b/pkg/ring/ring.go index 7377cbcccd4..f4557ec5436 100644 --- a/pkg/ring/ring.go +++ b/pkg/ring/ring.go @@ -333,12 +333,16 @@ func (r *Ring) updateRingState(ringDesc *Desc) { } rc := prevRing.RingCompare(ringDesc) - if rc == Equal || rc == EqualButStatesAndTimestamps { + if rc == Equal || rc == EqualButStatesAndTimestamps || rc == EqualButReadOnly { // No need to update tokens or zones. Only states and timestamps // have changed. (If Equal, nothing has changed, but that doesn't happen // when watching the ring for updates). r.mtx.Lock() r.ringDesc = ringDesc + if rc == EqualButReadOnly && r.shuffledSubringCache != nil { + // Invalidate all cached subrings. 
+ r.shuffledSubringCache = make(map[subringCacheKey]*Ring) + } r.updateRingMetrics(rc) r.mtx.Unlock() return @@ -852,7 +856,9 @@ func (r *Ring) shuffleShard(identifier string, size int, lookbackPeriod time.Dur // If the lookback is enabled and this instance has been registered within the lookback period // then we should include it in the subring but continuing selecting instances. - if lookbackPeriod > 0 && instance.RegisteredTimestamp >= lookbackUntil { + // If an instance is in READONLY we should always extend. The write path will filter it out when GetRing. + // The read path should extend to get new ingester used on write + if (lookbackPeriod > 0 && instance.RegisteredTimestamp >= lookbackUntil) || instance.State == READONLY { continue } diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go index a5937e2e8ec..ff51e9a4e6c 100644 --- a/pkg/ring/ring_test.go +++ b/pkg/ring/ring_test.go @@ -2523,6 +2523,111 @@ func TestRing_ShuffleShardWithLookback(t *testing.T) { } } +func TestRing_ShuffleShardWithReadOnlyIngesters(t *testing.T) { + g := NewRandomTokenGenerator() + + const ( + userID = "user-1" + ) + + tests := map[string]struct { + ringInstances map[string]InstanceDesc + ringReplicationFactor int + shardSize int + expectedSize int + op Operation + expectedToBePresent []string + }{ + "single zone, shard size = 1, default scenario": { + ringInstances: map[string]InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE, Tokens: g.GenerateTokens(NewDesc(), "instance-1", "zone-a", 128, true)}, + "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", State: ACTIVE, Tokens: g.GenerateTokens(NewDesc(), "instance-2", "zone-a", 128, true)}, + }, + ringReplicationFactor: 1, + shardSize: 1, + expectedSize: 1, + }, + "single zone, shard size = 1, not filter ReadOnly": { + ringInstances: map[string]InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE, Tokens: g.GenerateTokens(NewDesc(), "instance-1", "zone-a", 128, true)}, + "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", State: READONLY, Tokens: g.GenerateTokens(NewDesc(), "instance-2", "zone-a", 128, true)}, + }, + ringReplicationFactor: 1, + shardSize: 2, + expectedSize: 2, + }, + "single zone, shard size = 4, do not filter other states": { + ringInstances: map[string]InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE, Tokens: g.GenerateTokens(NewDesc(), "instance-1", "zone-a", 128, true)}, + "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", State: JOINING, Tokens: g.GenerateTokens(NewDesc(), "instance-2", "zone-a", 128, true)}, + "instance-3": {Addr: "127.0.0.3", Zone: "zone-a", State: LEAVING, Tokens: g.GenerateTokens(NewDesc(), "instance-3", "zone-a", 128, true)}, + "instance-4": {Addr: "127.0.0.4", Zone: "zone-a", State: PENDING, Tokens: g.GenerateTokens(NewDesc(), "instance-4", "zone-a", 128, true)}, + }, + ringReplicationFactor: 1, + shardSize: 4, + expectedSize: 4, + }, + "single zone, shard size = 4, extend on readOnly": { + ringInstances: map[string]InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE, Tokens: []uint32{2}}, + "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", State: ACTIVE, Tokens: []uint32{4}}, + "instance-3": {Addr: "127.0.0.3", Zone: "zone-a", State: ACTIVE, Tokens: []uint32{6}}, + "instance-4": {Addr: "127.0.0.4", Zone: "zone-a", State: READONLY, Tokens: []uint32{1, 3, 5}}, + }, + ringReplicationFactor: 1, + shardSize: 2, + expectedSize: 3, + expectedToBePresent: []string{"instance-4"}, + }, + "rf = 3, shard size 
= 4, extend readOnly from different zones": { + ringInstances: map[string]InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE, Tokens: []uint32{2}}, + "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", State: ACTIVE, Tokens: []uint32{12}}, + "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", State: ACTIVE, Tokens: []uint32{22}}, + "instance-4": {Addr: "127.0.0.4", Zone: "zone-a", State: ACTIVE, Tokens: []uint32{4}}, + "instance-5": {Addr: "127.0.0.5", Zone: "zone-b", State: ACTIVE, Tokens: []uint32{14}}, + "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", State: ACTIVE, Tokens: []uint32{24}}, + "instance-7": {Addr: "127.0.0.7", Zone: "zone-a", State: READONLY, Tokens: []uint32{1, 3}}, + "instance-8": {Addr: "127.0.0.8", Zone: "zone-b", State: READONLY, Tokens: []uint32{11, 13}}, + "instance-9": {Addr: "127.0.0.9", Zone: "zone-c", State: READONLY, Tokens: []uint32{21, 23}}, + }, + ringReplicationFactor: 3, + shardSize: 6, + expectedSize: 9, + expectedToBePresent: []string{"instance-7", "instance-8", "instance-9"}, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + // Init the ring. + ringDesc := &Desc{Ingesters: testData.ringInstances} + for id, instance := range ringDesc.Ingesters { + ringDesc.Ingesters[id] = instance + } + + ring := Ring{ + cfg: Config{ + ReplicationFactor: testData.ringReplicationFactor, + }, + ringDesc: ringDesc, + ringTokens: ringDesc.GetTokens(), + ringTokensByZone: ringDesc.getTokensByZone(), + ringInstanceByToken: ringDesc.getTokensInfo(), + ringZones: getZones(ringDesc.getTokensByZone()), + strategy: NewDefaultReplicationStrategy(), + KVClient: &MockClient{}, + } + + shardRing := ring.ShuffleShard(userID, testData.shardSize) + assert.Equal(t, testData.expectedSize, shardRing.InstancesCount()) + for _, expectedInstance := range testData.expectedToBePresent { + assert.True(t, shardRing.HasInstance(expectedInstance)) + } + }) + } +} + func TestRing_ShuffleShardWithLookback_CorrectnessWithFuzzy(t *testing.T) { // The goal of this test is NOT to ensure that the minimum required number of instances // are returned at any given time, BUT at least all required instances are returned. From 71fa87896a2646134ffac9bfe9997095adc1cce4 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Wed, 15 Jan 2025 15:21:43 -0800 Subject: [PATCH 31/34] Create guide doc for partition compaction Signed-off-by: Alex Le --- docs/guides/partitioning-compactor.md | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 docs/guides/partitioning-compactor.md diff --git a/docs/guides/partitioning-compactor.md b/docs/guides/partitioning-compactor.md new file mode 100644 index 00000000000..2a161a76a0e --- /dev/null +++ b/docs/guides/partitioning-compactor.md @@ -0,0 +1,42 @@ +--- +title: "Use Partition Compaction in Cortex" +linkTitle: "Partition Compaction" +weight: 10 +slug: partition-compaction +--- + +## Context + +Compactor is bonded by maximum 64GB of index file size. If compaction failed due to exceeding index file size limit, partition compaction can be enabled to allow compactor compacting into multiple blocks that have index file size stays within limit. 
+ +## Enable Partition Compaction + +In order to enable partition compaction, the following flag needs to be set: + +``` +-compactor.sharding-enabled=true +-compactor.sharding-strategy=shuffle-sharding +-compactor.compaction-strategy=partitioning +``` + +## Configure Partition Compaction + +By default, partition compaction utilizes the following configurations and their values: + +``` +-compactor.partition-index-size-bytes=68719476736 // 64GB +-compactor.partition-series-count=0 // no limit +``` + +The default value should start partitioning result blocks when sum of index files size of parent blocks exceeds 64GB. End user could also change those two configurations. Partition compaction would always calculate partition count based on both configuration and pick the one with higher partition count. + +Both configurations support to be set per tenant. + +Note: `compactor.partition-series-count` is using sum of series count of all parent blocks. If parent blocks were not deduped, the result block could have fewer series than the configuration value. + +## Useful Metrics + +- `cortex_compactor_group_partition_count`: can be used to keep track of how many partitions being compacted for each time range. +- `cortex_compactor_group_compactions_not_planned_total`: can be used to alarm any compaction was failed to be planned due to error. +- `cortex_compact_group_compaction_duration_seconds`: can be used to monitor compaction duration or each time range compactions. +- `cortex_compactor_oldest_partition_offset`: can be used to monitor when was the oldest compaction that is still not completed. From 117373a4d5fb325b02d8edccc1e962c866dc4286 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Thu, 23 Jan 2025 13:37:27 -0800 Subject: [PATCH 32/34] Update docs/guides/partitioning-compactor.md Co-authored-by: Charlie Le Signed-off-by: Alex Le Signed-off-by: Alex Le --- docs/guides/partitioning-compactor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/partitioning-compactor.md b/docs/guides/partitioning-compactor.md index 2a161a76a0e..189f1e14c47 100644 --- a/docs/guides/partitioning-compactor.md +++ b/docs/guides/partitioning-compactor.md @@ -7,7 +7,7 @@ slug: partition-compaction ## Context -Compactor is bonded by maximum 64GB of index file size. If compaction failed due to exceeding index file size limit, partition compaction can be enabled to allow compactor compacting into multiple blocks that have index file size stays within limit. +Compactor is bounded by maximum 64GB of index file size. If compaction failed due to exceeding index file size limit, partition compaction can be enabled to allow compactor compacting into multiple blocks that have index file size stays within limit. ## Enable Partition Compaction From a7466d1c996a7bd84505101a3e0ae81e7e4a1954 Mon Sep 17 00:00:00 2001 From: Alex Le Date: Thu, 30 Jan 2025 15:59:16 -0800 Subject: [PATCH 33/34] updated doc Signed-off-by: Alex Le --- docs/guides/partitioning-compactor.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/guides/partitioning-compactor.md b/docs/guides/partitioning-compactor.md index 189f1e14c47..c5aefee182c 100644 --- a/docs/guides/partitioning-compactor.md +++ b/docs/guides/partitioning-compactor.md @@ -14,18 +14,26 @@ Compactor is bounded by maximum 64GB of index file size. 
If compaction failed du In order to enable partition compaction, the following flag needs to be set: ``` --compactor.sharding-enabled=true --compactor.sharding-strategy=shuffle-sharding --compactor.compaction-strategy=partitioning +-compactor.sharding-enabled=true # Enable sharding tenants across multiple compactor instances. This is required to enable partition compaction +-compactor.sharding-strategy=shuffle-sharding # Use Shuffle Sharding as sharding strategy. This is required to enable partition compaction +-compactor.compaction-strategy=partitioning # Use Partition Compaction as compaction strategy. To turn if off, set it to `default` ``` +### Migration + +There is no special migration process needed to enable partition compaction. End user could enable it by setting the above configurations all at once. + +Enabling partition compaction would group previously compacted blocks (only those have time range smaller than the largest configured compaction time ranges) with uncompacted blocks and generate new compaction plans. This would group blocks having duplicated series together and those series would be deduped after compaction. + +Disabling partition compaction after enabled it does not need migration either. After disabling partition compaction, compactor would group partitioned result blocks together and compact them into one block. + ## Configure Partition Compaction By default, partition compaction utilizes the following configurations and their values: ``` --compactor.partition-index-size-bytes=68719476736 // 64GB --compactor.partition-series-count=0 // no limit +-compactor.partition-index-size-bytes=68719476736 # 64GB +-compactor.partition-series-count=0 # no limit ``` The default value should start partitioning result blocks when sum of index files size of parent blocks exceeds 64GB. End user could also change those two configurations. Partition compaction would always calculate partition count based on both configuration and pick the one with higher partition count. @@ -38,5 +46,5 @@ Note: `compactor.partition-series-count` is using sum of series count of all par - `cortex_compactor_group_partition_count`: can be used to keep track of how many partitions being compacted for each time range. - `cortex_compactor_group_compactions_not_planned_total`: can be used to alarm any compaction was failed to be planned due to error. -- `cortex_compact_group_compaction_duration_seconds`: can be used to monitor compaction duration or each time range compactions. +- `cortex_compact_group_compaction_duration_seconds`: can be used to monitor compaction duration of each time range compactions. - `cortex_compactor_oldest_partition_offset`: can be used to monitor when was the oldest compaction that is still not completed. From 6b8066b9c18686a630e7b7a42dbb211ce596057c Mon Sep 17 00:00:00 2001 From: Alex Le Date: Tue, 4 Feb 2025 10:04:55 -0800 Subject: [PATCH 34/34] clean white space Signed-off-by: Alex Le --- docs/guides/partitioning-compactor.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/partitioning-compactor.md b/docs/guides/partitioning-compactor.md index c5aefee182c..6238648cf87 100644 --- a/docs/guides/partitioning-compactor.md +++ b/docs/guides/partitioning-compactor.md @@ -21,7 +21,7 @@ In order to enable partition compaction, the following flag needs to be set: ### Migration -There is no special migration process needed to enable partition compaction. End user could enable it by setting the above configurations all at once. 
+There is no special migration process needed to enable partition compaction. End user could enable it by setting the above configurations all at once. Enabling partition compaction would group previously compacted blocks (only those have time range smaller than the largest configured compaction time ranges) with uncompacted blocks and generate new compaction plans. This would group blocks having duplicated series together and those series would be deduped after compaction. @@ -40,7 +40,7 @@ The default value should start partitioning result blocks when sum of index file Both configurations support to be set per tenant. -Note: `compactor.partition-series-count` is using sum of series count of all parent blocks. If parent blocks were not deduped, the result block could have fewer series than the configuration value. +Note: `compactor.partition-series-count` is using sum of series count of all parent blocks. If parent blocks were not deduped, the result block could have fewer series than the configuration value. ## Useful Metrics
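The guide above notes that `-compactor.partition-index-size-bytes` and `-compactor.partition-series-count` can also be set per tenant. As a minimal sketch of what such a runtime override might look like (the per-tenant key names below are assumptions for illustration only, not confirmed by this patch series, and should be checked against the generated config-file reference):

```
# Hypothetical runtime-overrides sketch for partition compaction limits.
# The key names are assumptions; verify them against docs/configuration/config-file-reference.md.
overrides:
  tenant-with-large-index:
    compactor_partition_index_size_bytes: 34359738368   # 32GB: start partitioning earlier than the 64GB default
    compactor_partition_series_count: 80000000           # also partition once parent blocks exceed 80M series
```

As described in the "Configure Partition Compaction" section above, the compactor computes a partition count from each of the two limits and uses whichever yields the higher count.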