Skip to content

Commit f755626

Browse files
authored
Port regexp_replace functions and related tests (#9454)
* Port regexp_replace functions and related tests
* porting tests
* delete files
* change config
* adding dependency
* change tests
* fix
* optimize code
* remove unused
1 parent 64f998f commit f755626

File tree

17 files changed

+202
-547
lines changed

17 files changed

+202
-547
lines changed

datafusion-cli/Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/tests/dataframe/dataframe_functions.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -468,7 +468,7 @@ async fn test_fn_regexp_match() -> Result<()> {
468468
#[tokio::test]
469469
#[cfg(feature = "unicode_expressions")]
470470
async fn test_fn_regexp_replace() -> Result<()> {
471-
let expr = regexp_replace(vec![col("a"), lit("[a-z]"), lit("x"), lit("g")]);
471+
let expr = regexp_replace(col("a"), lit("[a-z]"), lit("x"), lit("g"));
472472

473473
let expected = [
474474
"+----------------------------------------------------------+",

datafusion/expr/src/built_in_function.rs

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -218,9 +218,6 @@ pub enum BuiltinScalarFunction {
218218
OctetLength,
219219
/// random
220220
Random,
221-
/// regexp_match
222-
/// regexp_replace
223-
RegexpReplace,
224221
/// repeat
225222
Repeat,
226223
/// replace
@@ -417,7 +414,6 @@ impl BuiltinScalarFunction {
417414
BuiltinScalarFunction::MD5 => Volatility::Immutable,
418415
BuiltinScalarFunction::OctetLength => Volatility::Immutable,
419416
BuiltinScalarFunction::Radians => Volatility::Immutable,
420-
BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
421417
BuiltinScalarFunction::Repeat => Volatility::Immutable,
422418
BuiltinScalarFunction::Replace => Volatility::Immutable,
423419
BuiltinScalarFunction::Reverse => Volatility::Immutable,
@@ -674,9 +670,6 @@ impl BuiltinScalarFunction {
674670
BuiltinScalarFunction::Pi => Ok(Float64),
675671
BuiltinScalarFunction::Random => Ok(Float64),
676672
BuiltinScalarFunction::Uuid => Ok(Utf8),
677-
BuiltinScalarFunction::RegexpReplace => {
678-
utf8_to_str_type(&input_expr_types[0], "regexp_replace")
679-
}
680673
BuiltinScalarFunction::Repeat => {
681674
utf8_to_str_type(&input_expr_types[0], "repeat")
682675
}
@@ -1161,14 +1154,6 @@ impl BuiltinScalarFunction {
11611154
BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => {
11621155
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
11631156
}
1164-
BuiltinScalarFunction::RegexpReplace => Signature::one_of(
1165-
vec![
1166-
Exact(vec![Utf8, Utf8, Utf8]),
1167-
Exact(vec![Utf8, Utf8, Utf8, Utf8]),
1168-
],
1169-
self.volatility(),
1170-
),
1171-
11721157
BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
11731158
BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()),
11741159
BuiltinScalarFunction::Uuid => Signature::exact(vec![], self.volatility()),
@@ -1398,9 +1383,6 @@ impl BuiltinScalarFunction {
13981383
BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"],
13991384
BuiltinScalarFunction::FindInSet => &["find_in_set"],
14001385

1401-
// regex functions
1402-
BuiltinScalarFunction::RegexpReplace => &["regexp_replace"],
1403-
14041386
// time/date functions
14051387
BuiltinScalarFunction::Now => &["now"],
14061388
BuiltinScalarFunction::CurrentDate => &["current_date", "today"],

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -822,11 +822,6 @@ nary_scalar_expr!(
822822
rpad,
823823
"fill up a string to the length by appending the characters"
824824
);
825-
nary_scalar_expr!(
826-
RegexpReplace,
827-
regexp_replace,
828-
"replace strings that match a regular expression"
829-
);
830825
nary_scalar_expr!(
831826
Btrim,
832827
btrim,
@@ -1314,21 +1309,6 @@ mod test {
13141309
test_scalar_expr!(Ltrim, ltrim, string);
13151310
test_scalar_expr!(MD5, md5, string);
13161311
test_scalar_expr!(OctetLength, octet_length, string);
1317-
test_nary_scalar_expr!(
1318-
RegexpReplace,
1319-
regexp_replace,
1320-
string,
1321-
pattern,
1322-
replacement
1323-
);
1324-
test_nary_scalar_expr!(
1325-
RegexpReplace,
1326-
regexp_replace,
1327-
string,
1328-
pattern,
1329-
replacement,
1330-
flags
1331-
);
13321312
test_scalar_expr!(Replace, replace, string, from, to);
13331313
test_scalar_expr!(Repeat, repeat, string, count);
13341314
test_scalar_expr!(Reverse, reverse, string);

datafusion/functions/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,10 +55,11 @@ chrono = { workspace = true }
5555
datafusion-common = { workspace = true }
5656
datafusion-execution = { workspace = true }
5757
datafusion-expr = { workspace = true }
58+
datafusion-physical-expr = { workspace = true, default-features = true }
5859
hex = { version = "0.4", optional = true }
5960
itertools = { workspace = true }
6061
log = { workspace = true }
61-
62+
regex = { version = "1.8" }
6263
[dev-dependencies]
6364
criterion = "0.5"
6465
rand = { workspace = true }

datafusion/functions/benches/regx.rs

Lines changed: 39 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,17 +17,18 @@
1717

1818
extern crate criterion;
1919

20-
use std::sync::Arc;
21-
2220
use arrow_array::builder::StringBuilder;
2321
use arrow_array::{ArrayRef, StringArray};
2422
use criterion::{black_box, criterion_group, criterion_main, Criterion};
2523
use datafusion_functions::regex::regexplike::regexp_like;
2624
use datafusion_functions::regex::regexpmatch::regexp_match;
25+
use datafusion_functions::regex::regexpreplace::regexp_replace;
2726
use rand::distributions::Alphanumeric;
2827
use rand::rngs::ThreadRng;
2928
use rand::seq::SliceRandom;
3029
use rand::Rng;
30+
use std::iter;
31+
use std::sync::Arc;
3132
fn data(rng: &mut ThreadRng) -> StringArray {
3233
let mut data: Vec<String> = vec![];
3334
for _ in 0..1000 {
@@ -101,6 +102,42 @@ fn criterion_benchmark(c: &mut Criterion) {
101102
)
102103
})
103104
});
105+
106+
c.bench_function("regexp_match_1000", |b| {
107+
let mut rng = rand::thread_rng();
108+
let data = Arc::new(data(&mut rng)) as ArrayRef;
109+
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
110+
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
111+
112+
b.iter(|| {
113+
black_box(
114+
regexp_match::<i32>(&[data.clone(), regex.clone(), flags.clone()])
115+
.expect("regexp_match should work on valid values"),
116+
)
117+
})
118+
});
119+
120+
c.bench_function("regexp_replace_1000", |b| {
121+
let mut rng = rand::thread_rng();
122+
let data = Arc::new(data(&mut rng)) as ArrayRef;
123+
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
124+
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
125+
let replacement =
126+
Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000)))
127+
as ArrayRef;
128+
129+
b.iter(|| {
130+
black_box(
131+
regexp_replace::<i32>(&[
132+
data.clone(),
133+
regex.clone(),
134+
replacement.clone(),
135+
flags.clone(),
136+
])
137+
.expect("regexp_replace should work on valid values"),
138+
)
139+
})
140+
});
104141
}
105142

106143
criterion_group!(benches, criterion_benchmark);

datafusion/functions/src/regex/mod.rs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,15 @@
1919
2020
pub mod regexplike;
2121
pub mod regexpmatch;
22-
22+
pub mod regexpreplace;
2323
// create UDFs
2424
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
2525
make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
26+
make_udf_function!(
27+
regexpreplace::RegexpReplaceFunc,
28+
REGEXP_REPLACE,
29+
regexp_replace
30+
);
2631
export_functions!((
2732
regexp_match,
2833
input_arg1 input_arg2,
@@ -31,4 +36,4 @@ export_functions!((
3136
regexp_like,
3237
input_arg1 input_arg2,
3338
"Returns true if a has at least one match in a string,false otherwise."
34-
));
39+
),(regexp_replace, arg1 arg2 arg3 arg4, "Replaces substrings in a string that match"));

0 commit comments

Comments
 (0)