Skip to content

Commit 557f1f1

Browse files
kanoshiouastefanelasticsearchmachine
authored
ESQL: Fix alias removal in regex extraction with JOIN (#127687)
* Disallow removal of regex extracted fields --------- Co-authored-by: Andrei Stefan <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 265848e commit 557f1f1

File tree

6 files changed

+139
-10
lines changed

6 files changed

+139
-10
lines changed

docs/changelog/127687.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 127687
2+
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 127467

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase {
5151
"Unknown column \\[<all-fields-projected>\\]", // https://github.com/elastic/elasticsearch/issues/121741,
5252
"Plan \\[ProjectExec\\[\\[<no-fields>.* optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/125866
5353
"optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/116781
54-
"Unknown column", // https://github.com/elastic/elasticsearch/issues/127467
5554
"only supports KEYWORD or TEXT values", // https://github.com/elastic/elasticsearch/issues/127468
5655
"The incoming YAML document exceeds the limit:" // still to investigate, but it seems to be specific to the test framework
5756
);

x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,3 +1673,72 @@ null | Milky Way | Marunouchi
16731673
null | null | null
16741674
null | null | null
16751675
;
1676+
1677+
1678+
joinMaskingRegex
1679+
// https://github.com/elastic/elasticsearch/issues/127467
1680+
required_capability: union_types
1681+
required_capability: join_lookup_v12
1682+
required_capability: fix_join_masking_regex_extract
1683+
from books,message_*,ul*
1684+
| enrich languages_policy on status
1685+
| drop `language_name`, `bytes_out`, `id`, id
1686+
| dissect book_no "%{type}"
1687+
| dissect author.keyword "%{HZicfARaID}"
1688+
| mv_expand `status`
1689+
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
1690+
| enrich languages_policy on book_no
1691+
| grok message "%{WORD:DiLNyZKNDu}"
1692+
| limit 7972
1693+
| rename year as language_code
1694+
| lookup join languages_lookup on language_code
1695+
| limit 13966
1696+
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
1697+
| rename author.keyword as message
1698+
| lookup join message_types_lookup on message
1699+
| stats `ratings` = count(*) by type
1700+
| stats `type` = count(type), `ratings` = count(*)
1701+
| keep `ratings`, ratings
1702+
;
1703+
1704+
ratings:long
1705+
1
1706+
;
1707+
1708+
joinMaskingDissect
1709+
// https://github.com/elastic/elasticsearch/issues/127467
1710+
required_capability: join_lookup_v12
1711+
required_capability: fix_join_masking_regex_extract
1712+
from sample_data
1713+
| dissect message "%{type}"
1714+
| drop type
1715+
| lookup join message_types_lookup on message
1716+
| stats count = count(*) by type
1717+
| keep count
1718+
| sort count
1719+
;
1720+
count:long
1721+
1
1722+
3
1723+
3
1724+
;
1725+
1726+
1727+
joinMaskingGrok
1728+
// https://github.com/elastic/elasticsearch/issues/127467
1729+
required_capability: join_lookup_v12
1730+
required_capability: fix_join_masking_regex_extract
1731+
from sample_data
1732+
| grok message "%{WORD:type}"
1733+
| drop type
1734+
| lookup join message_types_lookup on message
1735+
| stats max = max(event_duration) by type
1736+
| keep max
1737+
| sort max
1738+
;
1739+
1740+
max:long
1741+
1232382
1742+
3450233
1743+
8268153
1744+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,13 @@ public enum Cap {
10851085
/**
10861086
* Full text functions in STATS
10871087
*/
1088-
FULL_TEXT_FUNCTIONS_IN_STATS_WHERE;
1088+
FULL_TEXT_FUNCTIONS_IN_STATS_WHERE,
1089+
1090+
/**
1091+
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
1092+
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467 </a>
1093+
*/
1094+
FIX_JOIN_MASKING_REGEX_EXTRACT;
10891095

10901096
private final boolean enabled;
10911097

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import org.elasticsearch.xpack.esql.core.expression.Expressions;
4242
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
4343
import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
44+
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
45+
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
4446
import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
4547
import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
4648
import org.elasticsearch.xpack.esql.core.util.Holder;
@@ -606,11 +608,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
606608

607609
parsed.forEachDown(p -> {// go over each plan top-down
608610
if (p instanceof RegexExtract re) { // for Grok and Dissect
609-
// remove other down-the-tree references to the extracted fields
610-
for (Attribute extracted : re.extractedFields()) {
611-
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
612-
}
613-
// but keep the inputs needed by Grok/Dissect
611+
// keep the inputs needed by Grok/Dissect
614612
referencesBuilder.addAll(re.input().references());
615613
} else if (p instanceof Enrich enrich) {
616614
AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields());
@@ -665,15 +663,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
665663
// remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
666664
// for example "from test | eval x = salary | stats max = max(x) by gender"
667665
// remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
666+
// also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
668667
AttributeSet planRefs = p.references();
669668
Set<String> fieldNames = planRefs.names();
670-
p.forEachExpressionDown(Alias.class, alias -> {
669+
p.forEachExpressionDown(NamedExpression.class, ne -> {
670+
if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
671+
return;
672+
}
671673
// do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
672674
// or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
673-
if (fieldNames.contains(alias.name())) {
675+
if (fieldNames.contains(ne.name())) {
674676
return;
675677
}
676-
referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr)));
678+
referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr)));
677679
});
678680
}
679681
});

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
13411341
assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
13421342
}
13431343

1344+
/**
1345+
* Fix alias removal in regex extraction with JOIN
1346+
* @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
1347+
*/
1348+
public void testAvoidGrokAttributesRemoval() {
1349+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1350+
Set<String> fieldNames = fieldNames("""
1351+
from message_types
1352+
| eval type = 1
1353+
| lookup join message_types_lookup on message
1354+
| drop message
1355+
| grok type "%{WORD:b}"
1356+
| stats x = max(b)
1357+
| keep x""", Set.of());
1358+
assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
1359+
}
1360+
1361+
public void testAvoidGrokAttributesRemoval2() {
1362+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1363+
Set<String> fieldNames = fieldNames("""
1364+
from sample_data
1365+
| dissect message "%{type}"
1366+
| drop type
1367+
| lookup join message_types_lookup on message
1368+
| stats count = count(*) by type
1369+
| keep count
1370+
| sort count""", Set.of());
1371+
assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*")));
1372+
}
1373+
1374+
public void testAvoidGrokAttributesRemoval3() {
1375+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1376+
Set<String> fieldNames = fieldNames("""
1377+
from sample_data
1378+
| grok message "%{WORD:type}"
1379+
| drop type
1380+
| lookup join message_types_lookup on message
1381+
| stats max = max(event_duration) by type
1382+
| keep max
1383+
| sort max""", Set.of());
1384+
assertThat(
1385+
fieldNames,
1386+
equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*"))
1387+
);
1388+
1389+
}
1390+
13441391
public void testEnrichOnDefaultField() {
13451392
Set<String> fieldNames = fieldNames("""
13461393
from employees

0 commit comments

Comments
 (0)