Skip to content

Commit a02b385

Browse files
astefankanoshiouelasticsearchmachine
authored
ESQL: Fix alias removal in regex extraction with JOIN (#127687) (#128202)
* Disallow removal of regex extracted fields --------- (cherry picked from commit 557f1f1) Co-authored-by: kanoshiou <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]>
1 parent c615db2 commit a02b385

File tree

5 files changed

+139
-9
lines changed

5 files changed

+139
-9
lines changed

docs/changelog/127687.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 127687
2+
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 127467

x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1592,3 +1592,72 @@ null | Milky Way | Marunouchi
15921592
null | null | null
15931593
null | null | null
15941594
;
1595+
1596+
1597+
joinMaskingRegex
1598+
// https://github.com/elastic/elasticsearch/issues/127467
1599+
required_capability: union_types
1600+
required_capability: join_lookup_v12
1601+
required_capability: fix_join_masking_regex_extract
1602+
from books,message_*,ul*
1603+
| enrich languages_policy on status
1604+
| drop `language_name`, `bytes_out`, `id`, id
1605+
| dissect book_no "%{type}"
1606+
| dissect author.keyword "%{HZicfARaID}"
1607+
| mv_expand `status`
1608+
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
1609+
| enrich languages_policy on book_no
1610+
| grok message "%{WORD:DiLNyZKNDu}"
1611+
| limit 7972
1612+
| rename year as language_code
1613+
| lookup join languages_lookup on language_code
1614+
| limit 13966
1615+
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
1616+
| rename author.keyword as message
1617+
| lookup join message_types_lookup on message
1618+
| stats `ratings` = count(*) by type
1619+
| stats `type` = count(type), `ratings` = count(*)
1620+
| keep `ratings`, ratings
1621+
;
1622+
1623+
ratings:long
1624+
1
1625+
;
1626+
1627+
joinMaskingDissect
1628+
// https://github.com/elastic/elasticsearch/issues/127467
1629+
required_capability: join_lookup_v12
1630+
required_capability: fix_join_masking_regex_extract
1631+
from sample_data
1632+
| dissect message "%{type}"
1633+
| drop type
1634+
| lookup join message_types_lookup on message
1635+
| stats count = count(*) by type
1636+
| keep count
1637+
| sort count
1638+
;
1639+
count:long
1640+
1
1641+
3
1642+
3
1643+
;
1644+
1645+
1646+
joinMaskingGrok
1647+
// https://github.com/elastic/elasticsearch/issues/127467
1648+
required_capability: join_lookup_v12
1649+
required_capability: fix_join_masking_regex_extract
1650+
from sample_data
1651+
| grok message "%{WORD:type}"
1652+
| drop type
1653+
| lookup join message_types_lookup on message
1654+
| stats max = max(event_duration) by type
1655+
| keep max
1656+
| sort max
1657+
;
1658+
1659+
max:long
1660+
1232382
1661+
3450233
1662+
8268153
1663+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,13 @@ public enum Cap {
895895
* Support for keeping `DROP` attributes when resolving field names.
896896
* see <a href="https://github.com/elastic/elasticsearch/issues/126418"> ES|QL: no matches for pattern #126418 </a>
897897
*/
898-
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL;
898+
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL,
899+
900+
/**
901+
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
902+
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467 </a>
903+
*/
904+
FIX_JOIN_MASKING_REGEX_EXTRACT;
899905

900906
private final boolean enabled;
901907

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import org.elasticsearch.xpack.esql.core.expression.Expressions;
4242
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
4343
import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
44+
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
45+
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
4446
import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
4547
import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
4648
import org.elasticsearch.xpack.esql.core.util.Holder;
@@ -594,11 +596,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
594596

595597
parsed.forEachDown(p -> {// go over each plan top-down
596598
if (p instanceof RegexExtract re) { // for Grok and Dissect
597-
// remove other down-the-tree references to the extracted fields
598-
for (Attribute extracted : re.extractedFields()) {
599-
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
600-
}
601-
// but keep the inputs needed by Grok/Dissect
599+
// keep the inputs needed by Grok/Dissect
602600
referencesBuilder.addAll(re.input().references());
603601
} else if (p instanceof Enrich enrich) {
604602
AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields());
@@ -653,15 +651,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
653651
// remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
654652
// for example "from test | eval x = salary | stats max = max(x) by gender"
655653
// remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
654+
// also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
656655
AttributeSet planRefs = p.references();
657656
Set<String> fieldNames = planRefs.names();
658-
p.forEachExpressionDown(Alias.class, alias -> {
657+
p.forEachExpressionDown(NamedExpression.class, ne -> {
658+
if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
659+
return;
660+
}
659661
// do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
660662
// or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
661-
if (fieldNames.contains(alias.name())) {
663+
if (fieldNames.contains(ne.name())) {
662664
return;
663665
}
664-
referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr)));
666+
referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr)));
665667
});
666668
}
667669
});

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
13411341
assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
13421342
}
13431343

1344+
/**
1345+
* Fix alias removal in regex extraction with JOIN
1346+
* @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
1347+
*/
1348+
public void testAvoidGrokAttributesRemoval() {
1349+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1350+
Set<String> fieldNames = fieldNames("""
1351+
from message_types
1352+
| eval type = 1
1353+
| lookup join message_types_lookup on message
1354+
| drop message
1355+
| grok type "%{WORD:b}"
1356+
| stats x = max(b)
1357+
| keep x""", Set.of());
1358+
assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
1359+
}
1360+
1361+
public void testAvoidGrokAttributesRemoval2() {
1362+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1363+
Set<String> fieldNames = fieldNames("""
1364+
from sample_data
1365+
| dissect message "%{type}"
1366+
| drop type
1367+
| lookup join message_types_lookup on message
1368+
| stats count = count(*) by type
1369+
| keep count
1370+
| sort count""", Set.of());
1371+
assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*")));
1372+
}
1373+
1374+
public void testAvoidGrokAttributesRemoval3() {
1375+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1376+
Set<String> fieldNames = fieldNames("""
1377+
from sample_data
1378+
| grok message "%{WORD:type}"
1379+
| drop type
1380+
| lookup join message_types_lookup on message
1381+
| stats max = max(event_duration) by type
1382+
| keep max
1383+
| sort max""", Set.of());
1384+
assertThat(
1385+
fieldNames,
1386+
equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*"))
1387+
);
1388+
1389+
}
1390+
13441391
public void testEnrichOnDefaultField() {
13451392
Set<String> fieldNames = fieldNames("""
13461393
from employees

0 commit comments

Comments
 (0)