Skip to content

Commit 75525a9

Browse files
authored
Expanded Postings Cache can cache results without the newly created series under high load. (#6417)
* [ExpandedPostingsCache] Querying and adding series concurrently can cache wrong results Signed-off-by: alanprot <[email protected]> * Expiring the series after commit call Signed-off-by: alanprot <[email protected]> * Adding option to run tests with no-race check Signed-off-by: alanprot <[email protected]> --------- Signed-off-by: alanprot <[email protected]>
1 parent 95236cf commit 75525a9

File tree

4 files changed

+132
-0
lines changed

4 files changed

+132
-0
lines changed

.github/workflows/test-build-deploy.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,24 @@ jobs:
6161
ln -s $GITHUB_WORKSPACE/* /go/src/github.com/cortexproject/cortex
6262
- name: Run Tests
6363
run: make BUILD_IN_CONTAINER=false test
64+
test-no-race:
65+
runs-on: ubuntu-20.04
66+
container:
67+
image: quay.io/cortexproject/build-image:master-0ddced051
68+
steps:
69+
- name: Checkout Repo
70+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
71+
- name: Setup Git safe.directory
72+
run: |
73+
echo "this step is needed because when running in container, actions/checkout does not set safe.directory effectively."
74+
echo "See https://github.com/actions/runner/issues/2033. We should use --system instead of --global"
75+
git config --system --add safe.directory $GITHUB_WORKSPACE
76+
- name: Sym Link Expected Path to Workspace
77+
run: |
78+
mkdir -p /go/src/github.com/cortexproject/cortex
79+
ln -s $GITHUB_WORKSPACE/* /go/src/github.com/cortexproject/cortex
80+
- name: Run Tests
81+
run: make BUILD_IN_CONTAINER=false test-no-race
6482

6583
security:
6684
name: CodeQL

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ lint:
218218
test:
219219
go test -tags netgo -timeout 30m -race -count 1 ./...
220220

221+
# Run the test suite without the race detector (no -race flag), so that test
# files guarded by `//go:build !race` are compiled and executed.
test-no-race:
222+
go test -tags netgo -timeout 30m -count 1 ./...
223+
221224
cover:
222225
$(eval COVERDIR := $(shell mktemp -d coverage.XXXXXXXXXX))
223226
$(eval COVERFILE := $(shell mktemp $(COVERDIR)/unit.XXXXXXXXXX))

pkg/ingester/ingester.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,8 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
12041204

12051205
// Walk the samples, appending them to the users database
12061206
app := db.Appender(ctx).(extendedAppender)
1207+
var newSeries []labels.Labels
1208+
12071209
for _, ts := range req.Timeseries {
12081210
// The labels must be sorted (in our case, it's guaranteed a write request
12091211
// has sorted labels once hit the ingester).
@@ -1233,6 +1235,10 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
12331235
copiedLabels = cortexpb.FromLabelAdaptersToLabelsWithCopy(ts.Labels)
12341236
// Retain the reference in case there are multiple samples for the series.
12351237
if ref, err = app.Append(0, copiedLabels, s.TimestampMs, s.Value); err == nil {
1238+
// Keep track of which series need to be expired from the postings cache
1239+
if db.postingCache != nil {
1240+
newSeries = append(newSeries, copiedLabels)
1241+
}
12361242
succeededSamplesCount++
12371243
continue
12381244
}
@@ -1274,6 +1280,10 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
12741280
// Copy the label set because both TSDB and the active series tracker may retain it.
12751281
copiedLabels = cortexpb.FromLabelAdaptersToLabelsWithCopy(ts.Labels)
12761282
if ref, err = app.AppendHistogram(0, copiedLabels, hp.TimestampMs, h, fh); err == nil {
1283+
// Keep track of which series need to be expired from the postings cache
1284+
if db.postingCache != nil {
1285+
newSeries = append(newSeries, copiedLabels)
1286+
}
12771287
succeededHistogramsCount++
12781288
continue
12791289
}
@@ -1342,6 +1352,17 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
13421352
if err := app.Commit(); err != nil {
13431353
return nil, wrapWithUser(err, userID)
13441354
}
1355+
1356+
// This is a workaround for https://github.com/prometheus/prometheus/pull/15579
1357+
// Calling expire here may result in the series names being expired multiple times,
1358+
// as there may be multiple Push operations concurrently for the same new timeseries.
1359+
// TODO: alanprot remove this when/if the PR is merged
1360+
if db.postingCache != nil {
1361+
for _, s := range newSeries {
1362+
db.postingCache.ExpireSeries(s)
1363+
}
1364+
}
1365+
13451366
i.TSDBState.appenderCommitDuration.Observe(time.Since(startCommit).Seconds())
13461367

13471368
// If only invalid samples or histograms are pushed, don't change "last update", as TSDB was not modified.
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
//go:build !race
2+
3+
package ingester
4+
5+
import (
6+
"context"
7+
"fmt"
8+
"math"
9+
"strconv"
10+
"sync"
11+
"testing"
12+
"time"
13+
14+
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/prometheus/model/labels"
16+
"github.com/stretchr/testify/require"
17+
"github.com/weaveworks/common/user"
18+
19+
"github.com/cortexproject/cortex/pkg/cortexpb"
20+
"github.com/cortexproject/cortex/pkg/ingester/client"
21+
"github.com/cortexproject/cortex/pkg/ring"
22+
"github.com/cortexproject/cortex/pkg/util/services"
23+
"github.com/cortexproject/cortex/pkg/util/test"
24+
)
25+
26+
// Running this test without race check as there is a known prometheus race condition.
27+
// See https://github.com/prometheus/prometheus/pull/15141 and https://github.com/prometheus/prometheus/pull/15316
28+
func TestExpandedCachePostings_Race(t *testing.T) {
29+
cfg := defaultIngesterTestConfig(t)
30+
cfg.BlocksStorageConfig.TSDB.BlockRanges = []time.Duration{2 * time.Hour}
31+
cfg.LifecyclerConfig.JoinAfter = 0
32+
cfg.BlocksStorageConfig.TSDB.PostingsCache.Head.Enabled = true
33+
34+
r := prometheus.NewRegistry()
35+
i, err := prepareIngesterWithBlocksStorage(t, cfg, r)
36+
require.NoError(t, err)
37+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), i))
38+
defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck
39+
40+
// Wait until the ingester is ACTIVE
41+
test.Poll(t, 100*time.Millisecond, ring.ACTIVE, func() interface{} {
42+
return i.lifecycler.GetState()
43+
})
44+
45+
ctx := user.InjectOrgID(context.Background(), "test")
46+
47+
wg := sync.WaitGroup{}
48+
labelNames := 100
49+
seriesPerLabelName := 200
50+
51+
for j := 0; j < labelNames; j++ {
52+
metricName := fmt.Sprintf("test_metric_%d", j)
53+
wg.Add(seriesPerLabelName * 2)
54+
for k := 0; k < seriesPerLabelName; k++ {
55+
go func() {
56+
defer wg.Done()
57+
_, err := i.Push(ctx, cortexpb.ToWriteRequest(
58+
[]labels.Labels{labels.FromStrings(labels.MetricName, metricName, "k", strconv.Itoa(k))},
59+
[]cortexpb.Sample{{Value: 1, TimestampMs: 9}}, nil, nil, cortexpb.API))
60+
require.NoError(t, err)
61+
}()
62+
63+
go func() {
64+
defer wg.Done()
65+
err := i.QueryStream(&client.QueryRequest{
66+
StartTimestampMs: 0,
67+
EndTimestampMs: math.MaxInt64,
68+
Matchers: []*client.LabelMatcher{{Type: client.EQUAL, Name: labels.MetricName, Value: metricName}},
69+
}, &mockQueryStreamServer{ctx: ctx})
70+
require.NoError(t, err)
71+
}()
72+
}
73+
74+
wg.Wait()
75+
76+
s := &mockQueryStreamServer{ctx: ctx}
77+
err = i.QueryStream(&client.QueryRequest{
78+
StartTimestampMs: 0,
79+
EndTimestampMs: math.MaxInt64,
80+
Matchers: []*client.LabelMatcher{{Type: client.EQUAL, Name: labels.MetricName, Value: metricName}},
81+
}, s)
82+
require.NoError(t, err)
83+
84+
set, err := seriesSetFromResponseStream(s)
85+
require.NoError(t, err)
86+
res, err := client.MatrixFromSeriesSet(set)
87+
require.NoError(t, err)
88+
require.Equal(t, seriesPerLabelName, res.Len())
89+
}
90+
}

0 commit comments

Comments
 (0)