Skip to content

Commit b777ac3

Browse files
authored
Fix how quickwit handles "." in field names. (#1362)
After this PR we still accept field names with a ".". On the query side this dot then needs to be escaped with a backslash to disambiguate it from the "." of Json fields, and the "." of object fields. Under the hood, we still rely on tantivy's query parser. The hack consists in escaping the tantivy field name itself. This PR updates tantivy in order to include quickwit-oss/tantivy#1356. Closes #1338 Closes #1334
1 parent c8ffd05 commit b777ac3

File tree

12 files changed

+192
-42
lines changed

12 files changed

+192
-42
lines changed

Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit-core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ quickwit-config = { version = "0.2.1", path = "../quickwit-config" }
2525
tokio = { version = "1", features = ["full"] }
2626
tokio-util = { version = "0.7", features = ["full"] }
2727
rand = "0.8"
28-
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "8c1e1cf", default-features = false, features = [
28+
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "ed26552", default-features = false, features = [
2929
"mmap",
3030
"lz4-compression",
3131
"quickwit",

quickwit-directories/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ futures = "0.3"
1616
serde = "1"
1717
serde_cbor = "0.11"
1818
serde_json = "1"
19-
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "8c1e1cf", default-features = false, features = [
19+
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "ed26552", default-features = false, features = [
2020
"mmap",
2121
"lz4-compression",
2222
"quickwit",

quickwit-doc-mapper/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ once_cell = "1.10"
1818
regex = "1"
1919
serde = { version = "1.0", features = ["derive"] }
2020
serde_json = "1.0"
21-
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "8c1e1cf", default-features = false, features = ["mmap", "lz4-compression", "quickwit"] }
22-
tantivy-query-grammar = { git = "https://github.com/quickwit-oss/tantivy/", rev = "8c1e1cf" }
21+
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "ed26552", default-features = false, features = ["mmap", "lz4-compression", "quickwit"] }
22+
tantivy-query-grammar = { git = "https://github.com/quickwit-oss/tantivy/", rev = "ed26552" }
2323
thiserror = "1.0"
2424
tracing = "0.1.29"
2525
typetag = "0.1"

quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs

Lines changed: 104 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@ impl DocMapper for DefaultDocMapper {
524524
mod tests {
525525
use std::collections::HashMap;
526526

527+
use quickwit_proto::SearchRequest;
527528
use serde_json::{self, json, Value as JsonValue};
528529
use tantivy::schema::{FieldType, Type, Value};
529530

@@ -560,9 +561,9 @@ mod tests {
560561
"owner": ["foo"],
561562
"body_other_tokenizer": ["20200415T072306-0700 INFO This is a great log"],
562563
"attributes.server": ["ABC"],
563-
"attributes.server.payload": [[97], [98]],
564+
"attributes.server\\.payload": [[97], [98]],
564565
"attributes.tags": [22, 23],
565-
"attributes.server.status": ["200", "201"]
566+
"attributes.server\\.status": ["200", "201"]
566567
}"#;
567568

568569
#[test]
@@ -573,7 +574,7 @@ mod tests {
573574
default_search_field_names.sort();
574575
assert_eq!(
575576
default_search_field_names,
576-
["attributes.server", "attributes.server.status", "body"]
577+
["attributes.server", r#"attributes.server\.status"#, "body"]
577578
);
578579
assert_eq!(config.field_mappings.num_fields(), 9);
579580
Ok(())
@@ -647,19 +648,20 @@ mod tests {
647648
}
648649

649650
#[test]
650-
fn test_accept_parsing_document_with_unknown_fields_and_missing_fields() -> anyhow::Result<()> {
651+
fn test_accept_parsing_document_with_unknown_fields_and_missing_fields() {
651652
let doc_mapper = crate::default_doc_mapper_for_tests();
652-
doc_mapper.doc_from_json(
653-
r#"{
653+
doc_mapper
654+
.doc_from_json(
655+
r#"{
654656
"timestamp": 1586960586000,
655657
"unknown_field": "20200415T072306-0700 INFO This is a great log",
656658
"response_date": "2021-12-19T16:39:57+00:00",
657659
"response_time": 12,
658660
"response_payload": "YWJj"
659661
}"#
660-
.to_string(),
661-
)?;
662-
Ok(())
662+
.to_string(),
663+
)
664+
.unwrap();
663665
}
664666

665667
#[test]
@@ -1332,4 +1334,97 @@ mod tests {
13321334
panic!("Expected json");
13331335
}
13341336
}
1337+
1338+
fn default_doc_mapper_query_aux(
1339+
doc_mapper: &dyn DocMapper,
1340+
query: &str,
1341+
) -> Result<String, String> {
1342+
let search_request = SearchRequest {
1343+
query: query.to_string(),
1344+
..Default::default()
1345+
};
1346+
let query = doc_mapper
1347+
.query(doc_mapper.schema(), &search_request)
1348+
.map_err(|err| err.to_string())?;
1349+
Ok(format!("{:?}", query))
1350+
}
1351+
1352+
#[test]
1353+
fn test_doc_mapper_sub_field_query_on_non_json_field_should_error() {
1354+
let doc_mapper: DefaultDocMapper = serde_json::from_str(
1355+
r#"{
1356+
"field_mappings": [{"name": "body", "type": "text"}],
1357+
"mode": "dynamic"
1358+
}"#,
1359+
)
1360+
.unwrap();
1361+
assert_eq!(
1362+
default_doc_mapper_query_aux(&doc_mapper, "body.wrong_field:hello").unwrap_err(),
1363+
"Field does not exists: 'body.wrong_field'"
1364+
);
1365+
}
1366+
1367+
#[test]
1368+
fn test_doc_mapper_accept_sub_field_query_on_json_field() {
1369+
let doc_mapper: DefaultDocMapper = serde_json::from_str(
1370+
r#"{
1371+
"field_mappings": [{"name": "body", "type": "json"}],
1372+
"mode": "dynamic"
1373+
}"#,
1374+
)
1375+
.unwrap();
1376+
assert_eq!(
1377+
default_doc_mapper_query_aux(&doc_mapper, "body.dynamic_field:hello"),
1378+
Ok(
1379+
r#"TermQuery(Term(type=Json, field=0, path=dynamic_field, vtype=Str, "hello"))"#
1380+
.to_string()
1381+
)
1382+
);
1383+
}
1384+
1385+
#[test]
1386+
fn test_doc_mapper_object_dot_collision_with_object_field() {
1387+
let doc_mapper: DefaultDocMapper = serde_json::from_str(
1388+
r#"{
1389+
"field_mappings": [
1390+
{
1391+
"name": "identity",
1392+
"type": "object",
1393+
"field_mappings": [{"type": "text", "name": "username"}]
1394+
},
1395+
{"type": "text", "name": "identity.username"}
1396+
]
1397+
}"#,
1398+
)
1399+
.unwrap();
1400+
assert_eq!(
1401+
default_doc_mapper_query_aux(&doc_mapper, "identity.username:toto").unwrap(),
1402+
r#"TermQuery(Term(type=Str, field=0, "toto"))"#
1403+
);
1404+
assert_eq!(
1405+
default_doc_mapper_query_aux(&doc_mapper, r#"identity\.username:toto"#).unwrap(),
1406+
r#"TermQuery(Term(type=Str, field=1, "toto"))"#
1407+
);
1408+
}
1409+
1410+
#[test]
1411+
fn test_doc_mapper_object_dot_collision_with_json_field() {
1412+
let doc_mapper: DefaultDocMapper = serde_json::from_str(
1413+
r#"{
1414+
"field_mappings": [
1415+
{"name": "identity", "type": "json"},
1416+
{"type": "text", "name": "identity.username"}
1417+
]
1418+
}"#,
1419+
)
1420+
.unwrap();
1421+
assert_eq!(
1422+
default_doc_mapper_query_aux(&doc_mapper, "identity.username:toto").unwrap(),
1423+
r#"TermQuery(Term(type=Json, field=0, path=username, vtype=Str, "toto"))"#
1424+
);
1425+
assert_eq!(
1426+
default_doc_mapper_query_aux(&doc_mapper, r#"identity\.username:toto"#).unwrap(),
1427+
r#"TermQuery(Term(type=Str, field=1, "toto"))"#
1428+
);
1429+
}
13351430
}

quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use std::any::type_name;
2121
use std::collections::BTreeMap;
2222

2323
use anyhow::bail;
24+
use itertools::Itertools;
2425
use serde_json::Value as JsonValue;
2526
use tantivy::schema::{
2627
BytesOptions, Cardinality, Field, JsonObjectOptions, NumericOptions, SchemaBuilder,
@@ -520,12 +521,36 @@ fn get_bytes_options(quickwit_numeric_options: &QuickwitNumericOptions) -> Bytes
520521
bytes_options
521522
}
522523

524+
/// Creates a tantivy field name for a given field path.
525+
///
526+
/// By field path, we mean the list of `field_name` that are crossed
527+
/// to reach the field starting from the root of the document.
528+
/// There can be more than one due to quickwit object type.
529+
///
530+
/// We simply concatenate these field names, interleaving them with '.'.
531+
/// If a fieldname itself contains a '.', we escape it with '\'.
532+
/// ('\' itself is forbidden).
533+
fn field_name_for_field_path(field_path: &[&str]) -> String {
534+
field_path.iter().cloned().map(escape_dots).join(".")
535+
}
536+
537+
fn escape_dots(field_name: &str) -> String {
538+
let mut escaped_field_name = String::new();
539+
for chr in field_name.chars() {
540+
if chr == '.' {
541+
escaped_field_name.push('\\');
542+
}
543+
escaped_field_name.push(chr);
544+
}
545+
escaped_field_name
546+
}
547+
523548
fn build_mapping_from_field_type<'a>(
524549
field_mapping_type: &'a FieldMappingType,
525550
field_path: &mut Vec<&'a str>,
526551
schema_builder: &mut SchemaBuilder,
527552
) -> anyhow::Result<MappingTree> {
528-
let field_name = field_path.join(".");
553+
let field_name = field_name_for_field_path(field_path);
529554
match field_mapping_type {
530555
FieldMappingType::Text(options, cardinality) => {
531556
let text_options: TextOptions = options.clone().into();
@@ -619,6 +644,29 @@ mod tests {
619644
QuickwitNumericOptions, QuickwitTextOptions,
620645
};
621646

647+
#[test]
648+
fn test_field_name_from_field_path() {
649+
// not really a possibility, but still, let's test it.
650+
assert_eq!(super::field_name_for_field_path(&[]), "");
651+
assert_eq!(super::field_name_for_field_path(&["hello"]), "hello");
652+
assert_eq!(
653+
super::field_name_for_field_path(&["one", "two", "three"]),
654+
"one.two.three"
655+
);
656+
assert_eq!(
657+
super::field_name_for_field_path(&["one", "two", "three"]),
658+
"one.two.three"
659+
);
660+
assert_eq!(
661+
super::field_name_for_field_path(&["one.two"]),
662+
r#"one\.two"#
663+
);
664+
assert_eq!(
665+
super::field_name_for_field_path(&["one.two", "three"]),
666+
r#"one\.two.three"#
667+
);
668+
}
669+
622670
#[test]
623671
fn test_get_or_insert_path() {
624672
let mut map = Default::default();

quickwit-doc-mapper/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ pub fn default_doc_mapper_for_tests() -> DefaultDocMapper {
5454
{
5555
"store_source": true,
5656
"default_search_fields": [
57-
"body", "attributes.server", "attributes.server.status"
57+
"body", "attributes.server", "attributes.server\\.status"
5858
],
5959
"timestamp_field": "timestamp",
6060
"sort_by": {

0 commit comments

Comments
 (0)