Skip to content

Commit c6b42cf

Browse files
committed
remove stats correction from ES|QL sample
1 parent f02a3c4 commit c6b42cf

File tree

12 files changed

+46
-181
lines changed

12 files changed

+46
-181
lines changed

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/RestSampleTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public class RestSampleTestCase extends ESRestTestCase {
3333
public void skipWhenSampleDisabled() throws IOException {
3434
assumeTrue(
3535
"Requires SAMPLE capability",
36-
EsqlSpecTestCase.hasCapabilities(adminClient(), List.of(EsqlCapabilities.Cap.SAMPLE_V2.capabilityName()))
36+
EsqlSpecTestCase.hasCapabilities(adminClient(), List.of(EsqlCapabilities.Cap.SAMPLE_V3.capabilityName()))
3737
);
3838
}
3939

x-pack/plugin/esql/qa/testFixtures/src/main/resources/sample.csv-spec

Lines changed: 32 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// because the CSV tests don't support such assertions.
1010

1111
row
12-
required_capability: sample_v2
12+
required_capability: sample_v3
1313

1414
ROW x = 1 | SAMPLE .999999999
1515
;
@@ -20,7 +20,7 @@ x:integer
2020

2121

2222
row and mv_expand
23-
required_capability: sample_v2
23+
required_capability: sample_v3
2424

2525
ROW x = [1,2,3,4,5] | MV_EXPAND x | SAMPLE .999999999
2626
;
@@ -35,15 +35,14 @@ x:integer
3535

3636

3737
adjust stats for sampling
38-
required_capability: sample_v2
38+
required_capability: sample_v3
3939

4040
FROM employees
4141
| SAMPLE 0.5
42-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no), sum_emp_no = SUM(emp_no)
43-
| EVAL is_expected = count >= 20 AND count <= 180 AND
44-
values_count >= 10 AND values_count <= 90 AND
42+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no), sum_emp_no = SUM(emp_no)
43+
| EVAL is_expected = count >= 10 AND count <= 90 AND
4544
avg_emp_no > 10010 AND avg_emp_no < 10090 AND
46-
sum_emp_no > 20*10010 AND sum_emp_no < 180*10090
45+
sum_emp_no > 10*10010 AND sum_emp_no < 90*10090
4746
| KEEP is_expected
4847
;
4948

@@ -53,14 +52,13 @@ true
5352

5453

5554
before where
56-
required_capability: sample_v2
55+
required_capability: sample_v3
5756

5857
FROM employees
5958
| SAMPLE 0.5
6059
| WHERE emp_no > 10050
61-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
62-
| EVAL is_expected = count >= 5 AND count <= 95 AND
63-
values_count >= 2 AND values_count <= 48 AND
60+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
61+
| EVAL is_expected = count >= 2 AND count <= 48 AND
6462
avg_emp_no > 10055 AND avg_emp_no < 10095
6563
| KEEP is_expected
6664
;
@@ -71,14 +69,13 @@ true
7169

7270

7371
after where
74-
required_capability: sample_v2
72+
required_capability: sample_v3
7573

7674
FROM employees
7775
| WHERE emp_no <= 10050
7876
| SAMPLE 0.5
79-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
80-
| EVAL is_expected = count >= 5 AND count <= 95 AND
81-
values_count >= 2 AND values_count <= 48 AND
77+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
78+
| EVAL is_expected = count >= 2 AND count <= 48 AND
8279
avg_emp_no > 10005 AND avg_emp_no < 10045
8380
| KEEP is_expected
8481
;
@@ -89,14 +86,13 @@ true
8986

9087

9188
before sort
92-
required_capability: sample_v2
89+
required_capability: sample_v3
9390

9491
FROM employees
9592
| SAMPLE 0.5
9693
| SORT emp_no
97-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
98-
| EVAL is_expected = count >= 20 AND count <= 180 AND
99-
values_count >= 10 AND values_count <= 90 AND
94+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
95+
| EVAL is_expected = count >= 10 AND count <= 90 AND
10096
avg_emp_no > 10010 AND avg_emp_no < 10090
10197
| KEEP is_expected
10298
;
@@ -107,14 +103,13 @@ true
107103

108104

109105
after sort
110-
required_capability: sample_v2
106+
required_capability: sample_v3
111107

112108
FROM employees
113109
| SORT emp_no
114110
| SAMPLE 0.5
115-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
116-
| EVAL is_expected = count >= 20 AND count <= 180 AND
117-
values_count >= 10 AND values_count <= 90 AND
111+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
112+
| EVAL is_expected = count >= 10 AND count <= 90 AND
118113
avg_emp_no > 10010 AND avg_emp_no < 10090
119114
| KEEP is_expected
120115
;
@@ -125,13 +120,13 @@ true
125120

126121

127122
before limit
128-
required_capability: sample_v2
123+
required_capability: sample_v3
129124

130125
FROM employees
131126
| SAMPLE 0.5
132127
| LIMIT 10
133-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no))
134-
| EVAL is_expected = count == 10 AND values_count == 10
128+
| STATS count = COUNT()
129+
| EVAL is_expected = count == 10
135130
| KEEP is_expected
136131
;
137132

@@ -141,14 +136,13 @@ true
141136

142137

143138
after limit
144-
required_capability: sample_v2
139+
required_capability: sample_v3
145140

146141
FROM employees
147142
| LIMIT 50
148143
| SAMPLE 0.5
149-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no))
150-
| EVAL is_expected = count >= 5 AND count <= 95 AND
151-
values_count >= 2 AND values_count <= 48
144+
| STATS count = COUNT()
145+
| EVAL is_expected = count >= 2 AND count <= 48
152146
| KEEP is_expected
153147
;
154148

@@ -158,7 +152,7 @@ true
158152

159153

160154
before mv_expand
161-
required_capability: sample_v2
155+
required_capability: sample_v3
162156

163157
ROW x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50], y = [1,2]
164158
| MV_EXPAND x
@@ -176,7 +170,7 @@ true
176170

177171

178172
after mv_expand
179-
required_capability: sample_v2
173+
required_capability: sample_v3
180174

181175
ROW x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50], y = [1,2]
182176
| MV_EXPAND x
@@ -194,15 +188,14 @@ true
194188

195189

196190
multiple samples
197-
required_capability: sample_v2
191+
required_capability: sample_v3
198192

199193
FROM employees
200194
| SAMPLE 0.7
201195
| SAMPLE 0.8
202196
| SAMPLE 0.9
203-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
204-
| EVAL is_expected = count >= 20 AND count <= 180 AND
205-
values_count >= 10 AND values_count <= 90 AND
197+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
198+
| EVAL is_expected = count >= 10 AND count <= 90 AND
206199
avg_emp_no > 10010 AND avg_emp_no < 10090
207200
| KEEP is_expected
208201
;
@@ -213,15 +206,14 @@ true
213206

214207

215208
after stats
216-
required_capability: sample_v2
209+
required_capability: sample_v3
217210

218211
FROM employees
219212
| SAMPLE 0.5
220213
| STATS avg_salary = AVG(salary) BY job_positions
221214
| SAMPLE 0.8
222-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(avg_salary)), avg_avg_salary = AVG(avg_salary)
223-
| EVAL is_expected = count >= 1 AND count <= 20 AND
224-
values_count >= 1 AND values_count <= 16 AND
215+
| STATS count = COUNT(), avg_avg_salary = AVG(avg_salary)
216+
| EVAL is_expected = count >= 1 AND count <= 16 AND
225217
avg_avg_salary > 25000 AND avg_avg_salary < 75000
226218
| KEEP is_expected
227219
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1082,7 +1082,7 @@ public enum Cap {
10821082
/**
10831083
* Support for the SAMPLE command
10841084
*/
1085-
SAMPLE_V2(Build.current().isSnapshot()),
1085+
SAMPLE_V3(Build.current().isSnapshot()),
10861086

10871087
/**
10881088
* The {@code _query} API now gives a cast recommendation if multiple types are found in certain instances.

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,9 @@
3939
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT;
4040
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isType;
4141

42-
public class Count extends AggregateFunction implements ToAggregator, SurrogateExpression, HasSampleCorrection {
42+
public class Count extends AggregateFunction implements ToAggregator, SurrogateExpression {
4343
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Count", Count::new);
4444

45-
private final boolean isSampleCorrected;
46-
4745
@FunctionInfo(
4846
returnType = "long",
4947
description = "Returns the total number (count) of input values.",
@@ -98,20 +96,11 @@ public Count(
9896
}
9997

10098
public Count(Source source, Expression field, Expression filter) {
101-
this(source, field, filter, false);
102-
}
103-
104-
private Count(Source source, Expression field, Expression filter, boolean isSampleCorrected) {
10599
super(source, field, filter, emptyList());
106-
this.isSampleCorrected = isSampleCorrected;
107100
}
108101

109102
private Count(StreamInput in) throws IOException {
110103
super(in);
111-
// isSampleCorrected is only used during query optimization to mark
112-
// whether this function has been processed. Hence there's no need to
113-
// serialize it.
114-
this.isSampleCorrected = false;
115104
}
116105

117106
@Override
@@ -182,14 +171,4 @@ public Expression surrogate() {
182171

183172
return null;
184173
}
185-
186-
@Override
187-
public boolean isSampleCorrected() {
188-
return isSampleCorrected;
189-
}
190-
191-
@Override
192-
public Expression sampleCorrection(Expression sampleProbability) {
193-
return new ToLong(source(), new Div(source(), new Count(source(), field(), filter(), true), sampleProbability));
194-
}
195174
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/HasSampleCorrection.java

Lines changed: 0 additions & 21 deletions
This file was deleted.

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Sum.java

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,7 @@
2626
import org.elasticsearch.xpack.esql.expression.function.FunctionType;
2727
import org.elasticsearch.xpack.esql.expression.function.Param;
2828
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.FromAggregateMetricDouble;
29-
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
3029
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvSum;
31-
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Div;
3230
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Mul;
3331

3432
import java.io.IOException;
@@ -45,11 +43,9 @@
4543
/**
4644
* Sum all values of a field in matching documents.
4745
*/
48-
public class Sum extends NumericAggregate implements SurrogateExpression, HasSampleCorrection {
46+
public class Sum extends NumericAggregate implements SurrogateExpression {
4947
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Sum", Sum::new);
5048

51-
private final boolean isSampleCorrected;
52-
5349
@FunctionInfo(
5450
returnType = { "long", "double" },
5551
description = "The sum of a numeric expression.",
@@ -69,20 +65,11 @@ public Sum(Source source, @Param(name = "number", type = { "aggregate_metric_dou
6965
}
7066

7167
public Sum(Source source, Expression field, Expression filter) {
72-
this(source, field, filter, false);
73-
}
74-
75-
private Sum(Source source, Expression field, Expression filter, boolean isSampleCorrected) {
7668
super(source, field, filter, emptyList());
77-
this.isSampleCorrected = isSampleCorrected;
7869
}
7970

8071
private Sum(StreamInput in) throws IOException {
8172
super(in);
82-
// isSampleCorrected is only used during query optimization to mark
83-
// whether this function has been processed. Hence there's no need to
84-
// serialize it.
85-
this.isSampleCorrected = false;
8673
}
8774

8875
@Override
@@ -160,19 +147,4 @@ public Expression surrogate() {
160147
? new Mul(s, new MvSum(s, field), new Count(s, new Literal(s, StringUtils.WILDCARD, DataType.KEYWORD)))
161148
: null;
162149
}
163-
164-
@Override
165-
public boolean isSampleCorrected() {
166-
return isSampleCorrected;
167-
}
168-
169-
@Override
170-
public Expression sampleCorrection(Expression sampleProbability) {
171-
Expression correctedSum = new Div(source(), new Sum(source(), field(), filter(), true), sampleProbability);
172-
return switch (dataType()) {
173-
case DOUBLE -> correctedSum;
174-
case LONG -> new ToLong(source(), correctedSum);
175-
default -> throw new IllegalStateException("unexpected data type [" + dataType() + "]");
176-
};
177-
}
178150
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.elasticsearch.xpack.esql.VerificationException;
1111
import org.elasticsearch.xpack.esql.common.Failures;
1212
import org.elasticsearch.xpack.esql.core.type.DataType;
13-
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ApplySampleCorrections;
1413
import org.elasticsearch.xpack.esql.optimizer.rules.logical.BooleanFunctionEqualsElimination;
1514
import org.elasticsearch.xpack.esql.optimizer.rules.logical.BooleanSimplification;
1615
import org.elasticsearch.xpack.esql.optimizer.rules.logical.CombineBinaryComparisons;
@@ -130,7 +129,6 @@ protected static Batch<LogicalPlan> substitutions() {
130129
return new Batch<>(
131130
"Substitutions",
132131
Limiter.ONCE,
133-
new ApplySampleCorrections(),
134132
new SubstituteSurrogatePlans(),
135133
// Translate filtered expressions into aggregate with filters - can't use surrogate expressions because it was
136134
// retrofitted for constant folding - this needs to be fixed.

0 commit comments

Comments
 (0)