Skip to content

Commit 1287529

Browse files
tustvold and alamb authored
Simplify InListExpr ~20-70% Faster (#4057)
* Simplify InList expression
* Simplify
* Hash floats as integers
* Fix tests
* Format
* Update datafusion-cli lockfile
* Sort Cargo.toml
* Update datafusion/physical-expr/src/expressions/in_list.rs

Co-authored-by: Andrew Lamb <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
1 parent 8d6448e commit 1287529

File tree

8 files changed

+215
-1140
lines changed

8 files changed

+215
-1140
lines changed

datafusion-cli/Cargo.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/Cargo.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion
5757
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
5858
apache-avro = { version = "0.14", optional = true }
5959
arrow = { version = "25.0.0", features = ["prettyprint"] }
60-
arrow-buffer = "25.0.0"
61-
arrow-schema = "25.0.0"
60+
6261
async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "futures-io", "tokio"] }
6362
async-trait = "0.1.41"
6463
bytes = "1.1"
@@ -74,7 +73,6 @@ datafusion-sql = { path = "../sql", version = "13.0.0" }
7473
flate2 = "1.0.24"
7574
futures = "0.3"
7675
glob = "0.3.0"
77-
half = { version = "2.1", default-features = false }
7876
hashbrown = { version = "0.12", features = ["raw"] }
7977
itertools = "0.10"
8078
lazy_static = { version = "^1.4.0" }

datafusion/core/src/physical_plan/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,6 @@ pub mod empty;
525525
pub mod explain;
526526
pub mod file_format;
527527
pub mod filter;
528-
pub mod hash_utils;
529528
pub mod joins;
530529
pub mod limit;
531530
pub mod memory;
@@ -541,4 +540,6 @@ pub mod values;
541540
pub mod windows;
542541

543542
use crate::execution::context::TaskContext;
544-
pub use datafusion_physical_expr::{expressions, functions, type_coercion, udf};
543+
pub use datafusion_physical_expr::{
544+
expressions, functions, hash_utils, type_coercion, udf,
545+
};

datafusion/core/src/physical_plan/planner.rs

Lines changed: 3 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2035,8 +2035,9 @@ mod tests {
20352035
.build()?;
20362036
let execution_plan = plan(&logical_plan).await?;
20372037
// verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated.
2038-
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }], negated: false, set: None }";
2039-
assert!(format!("{:?}", execution_plan).contains(expected));
2038+
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }], negated: false }";
2039+
let actual = format!("{:?}", execution_plan);
2040+
assert!(actual.contains(expected), "{}", actual);
20402041

20412042
Ok(())
20422043
}
@@ -2068,50 +2069,6 @@ mod tests {
20682069
lit(struct_literal)
20692070
}
20702071

2071-
#[tokio::test]
2072-
async fn in_set_test() -> Result<()> {
2073-
// OPTIMIZER_INSET_THRESHOLD = 10
2074-
// expression: "a in ('a', 1, 2, ..30)"
2075-
let mut list = vec![Expr::Literal(ScalarValue::Utf8(Some("a".to_string())))];
2076-
for i in 1..31 {
2077-
list.push(Expr::Literal(ScalarValue::Int64(Some(i))));
2078-
}
2079-
let logical_plan = test_csv_scan()
2080-
.await?
2081-
.filter(col("c12").lt(lit(0.05)))?
2082-
.project(vec![col("c1").in_list(list, false)])?
2083-
.build()?;
2084-
let execution_plan = plan(&logical_plan).await?;
2085-
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") },";
2086-
assert!(format!("{:?}", execution_plan).contains(expected));
2087-
let expected =
2088-
"Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet { set: ";
2089-
assert!(format!("{:?}", execution_plan).contains(expected));
2090-
Ok(())
2091-
}
2092-
2093-
#[tokio::test]
2094-
async fn in_set_null_test() -> Result<()> {
2095-
// test NULL
2096-
let mut list = vec![Expr::Literal(ScalarValue::Int64(None))];
2097-
for i in 1..31 {
2098-
list.push(Expr::Literal(ScalarValue::Int64(Some(i))));
2099-
}
2100-
2101-
let logical_plan = test_csv_scan()
2102-
.await?
2103-
.filter(col("c12").lt(lit(0.05)))?
2104-
.project(vec![col("c1").in_list(list, false)])?
2105-
.build()?;
2106-
let execution_plan = plan(&logical_plan).await?;
2107-
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(NULL) }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") }";
2108-
assert!(format!("{:?}", execution_plan).contains(expected));
2109-
let expected =
2110-
"Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet";
2111-
assert!(format!("{:?}", execution_plan).contains(expected));
2112-
Ok(())
2113-
}
2114-
21152072
#[tokio::test]
21162073
async fn hash_agg_input_schema() -> Result<()> {
21172074
let logical_plan = test_csv_scan_with_name("aggregate_test_100")

datafusion/physical-expr/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,15 @@ unicode_expressions = ["unicode-segmentation"]
4141
[dependencies]
4242
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
4343
arrow = { version = "25.0.0", features = ["prettyprint"] }
44+
arrow-buffer = "25.0.0"
45+
arrow-schema = "25.0.0"
4446
blake2 = { version = "^0.10.2", optional = true }
4547
blake3 = { version = "1.0", optional = true }
4648
chrono = { version = "0.4.22", default-features = false }
4749
datafusion-common = { path = "../common", version = "13.0.0" }
4850
datafusion-expr = { path = "../expr", version = "13.0.0" }
4951
datafusion-row = { path = "../row", version = "13.0.0" }
52+
half = { version = "2.1", default-features = false }
5053
hashbrown = { version = "0.12", features = ["raw"] }
5154
itertools = { version = "0.10", features = ["use_std"] }
5255
lazy_static = { version = "^1.4.0" }

0 commit comments

Comments
 (0)