Skip to content

Commit 82fb5b9

Browse files
authored
Faster character_length() string function for ASCII-only case (#12356)
* character_length() benchmark * char_length() ascii fast path * use usize_as
1 parent 453c026 commit 82fb5b9

File tree

4 files changed

+144
-6
lines changed

4 files changed

+144
-6
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,8 @@ required-features = ["math_expressions"]
166166
harness = false
167167
name = "substr"
168168
required-features = ["unicode_expressions"]
169+
170+
[[bench]]
171+
harness = false
172+
name = "character_length"
173+
required-features = ["unicode_expressions"]
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::array::{StringArray, StringViewArray};
21+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
22+
use datafusion_expr::ColumnarValue;
23+
use rand::distributions::Alphanumeric;
24+
use rand::{rngs::StdRng, Rng, SeedableRng};
25+
use std::sync::Arc;
26+
27+
/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
28+
/// 4096 rows, each row containing a string with 128 random characters.
29+
/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
30+
fn gen_string_array(
31+
n_rows: usize,
32+
str_len_chars: usize,
33+
null_density: f32,
34+
utf8_density: f32,
35+
is_string_view: bool, // false -> StringArray, true -> StringViewArray
36+
) -> Vec<ColumnarValue> {
37+
let mut rng = StdRng::seed_from_u64(42);
38+
let rng_ref = &mut rng;
39+
40+
let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
41+
let corpus_char_count = corpus.chars().count();
42+
43+
let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
44+
for _ in 0..n_rows {
45+
let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
46+
if rand_num < null_density {
47+
output_string_vec.push(None);
48+
} else if rand_num < null_density + utf8_density {
49+
// Generate random UTF8 string
50+
let mut generated_string = String::with_capacity(str_len_chars);
51+
for _ in 0..str_len_chars {
52+
let idx = rng_ref.gen_range(0..corpus_char_count);
53+
let char = corpus.chars().nth(idx).unwrap();
54+
generated_string.push(char);
55+
}
56+
output_string_vec.push(Some(generated_string));
57+
} else {
58+
// Generate random ASCII-only string
59+
let value = rng_ref
60+
.sample_iter(&Alphanumeric)
61+
.take(str_len_chars)
62+
.collect();
63+
let value = String::from_utf8(value).unwrap();
64+
output_string_vec.push(Some(value));
65+
}
66+
}
67+
68+
if is_string_view {
69+
let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
70+
vec![ColumnarValue::Array(Arc::new(string_view_array))]
71+
} else {
72+
let string_array: StringArray = output_string_vec.clone().into_iter().collect();
73+
vec![ColumnarValue::Array(Arc::new(string_array))]
74+
}
75+
}
76+
77+
fn criterion_benchmark(c: &mut Criterion) {
78+
// All benches are single batch run with 8192 rows
79+
let character_length = datafusion_functions::unicode::character_length();
80+
81+
let n_rows = 8192;
82+
for str_len in [8, 32, 128, 4096] {
83+
// StringArray ASCII only
84+
let args_string_ascii = gen_string_array(n_rows, str_len, 0.1, 0.0, false);
85+
c.bench_function(
86+
&format!("character_length_StringArray_ascii_str_len_{}", str_len),
87+
|b| b.iter(|| black_box(character_length.invoke(&args_string_ascii))),
88+
);
89+
90+
// StringArray UTF8
91+
let args_string_utf8 = gen_string_array(n_rows, str_len, 0.1, 0.5, false);
92+
c.bench_function(
93+
&format!("character_length_StringArray_utf8_str_len_{}", str_len),
94+
|b| b.iter(|| black_box(character_length.invoke(&args_string_utf8))),
95+
);
96+
97+
// StringViewArray ASCII only
98+
let args_string_view_ascii = gen_string_array(n_rows, str_len, 0.1, 0.0, true);
99+
c.bench_function(
100+
&format!("character_length_StringViewArray_ascii_str_len_{}", str_len),
101+
|b| b.iter(|| black_box(character_length.invoke(&args_string_view_ascii))),
102+
);
103+
104+
// StringViewArray UTF8
105+
let args_string_view_utf8 = gen_string_array(n_rows, str_len, 0.1, 0.5, true);
106+
c.bench_function(
107+
&format!("character_length_StringViewArray_utf8_str_len_{}", str_len),
108+
|b| b.iter(|| black_box(character_length.invoke(&args_string_view_utf8))),
109+
);
110+
}
111+
}
112+
113+
criterion_group!(benches, criterion_benchmark);
114+
criterion_main!(benches);

datafusion/functions/src/string/common.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,18 +351,29 @@ pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
351351
///
352352
/// This iterator returns `Option<&str>` for each item in the array.
353353
fn iter(&self) -> ArrayIter<Self>;
354+
355+
/// Check if the array is ASCII only.
356+
fn is_ascii(&self) -> bool;
354357
}
355358

356359
impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
357360
fn iter(&self) -> ArrayIter<Self> {
358361
GenericStringArray::<T>::iter(self)
359362
}
363+
364+
fn is_ascii(&self) -> bool {
365+
GenericStringArray::<T>::is_ascii(self)
366+
}
360367
}
361368

362369
impl<'a> StringArrayType<'a> for &'a StringViewArray {
363370
fn iter(&self) -> ArrayIter<Self> {
364371
StringViewArray::iter(self)
365372
}
373+
374+
fn is_ascii(&self) -> bool {
375+
StringViewArray::is_ascii(self)
376+
}
366377
}
367378

368379
/// Optimized version of the StringBuilder in Arrow that:

datafusion/functions/src/unicode/character_length.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::string::common::StringArrayType;
1819
use crate::utils::{make_scalar_function, utf8_to_int_type};
1920
use arrow::array::{
20-
Array, ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray,
21-
OffsetSizeTrait, PrimitiveArray,
21+
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
2222
};
2323
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
2424
use datafusion_common::Result;
@@ -99,18 +99,26 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
9999
}
100100
}
101101

102-
fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>(
102+
fn character_length_general<'a, T: ArrowPrimitiveType, V: StringArrayType<'a>>(
103103
array: V,
104104
) -> Result<ArrayRef>
105105
where
106106
T::Native: OffsetSizeTrait,
107107
{
108-
let iter = ArrayIter::new(array);
108+
// String characters are variable length encoded in UTF-8, counting the
109+
// number of chars requires expensive decoding, however checking if the
110+
// string is ASCII only is relatively cheap.
111+
// If strings are ASCII only, count bytes instead.
112+
let is_array_ascii_only = array.is_ascii();
113+
let iter = array.iter();
109114
let result = iter
110115
.map(|string| {
111116
string.map(|string: &str| {
112-
T::Native::from_usize(string.chars().count())
113-
.expect("should not fail as string.chars will always return integer")
117+
if is_array_ascii_only {
118+
T::Native::usize_as(string.len())
119+
} else {
120+
T::Native::usize_as(string.chars().count())
121+
}
114122
})
115123
})
116124
.collect::<PrimitiveArray<T>>();

0 commit comments

Comments
 (0)