Skip to content

Commit 2b3f70c

Browse files
committed
more efficient bit counting
1 parent c883f06 commit 2b3f70c

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

datafusion/core/src/physical_plan/file_format/row_filter.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,7 @@ impl ArrowPredicate for DatafusionArrowPredicate {
118118
Ok(array) => {
119119
if let Some(mask) = array.as_any().downcast_ref::<BooleanArray>() {
120120
let bool_arr = BooleanArray::from(mask.data().clone());
121-
// TODO is there a more efficient way to count the rows that are filtered?
122-
let num_filtered =
123-
bool_arr.iter().filter(|p| !matches!(p, Some(true))).count();
121+
let num_filtered = bool_arr.len() - true_count(&bool_arr);
124122
self.rows_filtered.add(num_filtered);
125123
timer.stop();
126124
Ok(bool_arr)
@@ -138,6 +136,27 @@ impl ArrowPredicate for DatafusionArrowPredicate {
138136
}
139137
}
140138

139+
/// Return the number of entries in `arr` that are both non-null and `true`.
///
/// When `arr` has a null buffer, an entry is counted only if its bit is set in
/// both the null buffer and the values buffer. Without a null buffer, every set
/// bit in the values buffer is counted. Both paths read `arr.len()` bits
/// starting at bit `arr.offset()`.
140+
// TODO remove when https://github.com/apache/arrow-rs/issues/2963 is released
141+
fn true_count(arr: &BooleanArray) -> usize {
142+
match arr.data().null_buffer() {
143+
// Null buffer present: combine it with the values buffer chunk by chunk.
Some(nulls) => {
144+
// Use the same offset and length for both buffers so the chunks pair up
// bit for bit.
let null_chunks = nulls.bit_chunks(arr.offset(), arr.len());
145+
let value_chunks = arr.values().bit_chunks(arr.offset(), arr.len());
146+
null_chunks
147+
.iter()
148+
.zip(value_chunks.iter())
149+
// The trailing bits are exposed separately via `remainder_bits()`, so
// chain them on as one final (null, value) pair to be counted as well.
.chain(std::iter::once((
150+
null_chunks.remainder_bits(),
151+
value_chunks.remainder_bits(),
152+
)))
153+
// A bit survives the AND only when it is set in both buffers.
.map(|(a, b)| (a & b).count_ones() as usize)
154+
.sum()
155+
}
156+
// No null buffer: count the set value bits in the array's range directly.
None => arr.values().count_set_bits_offset(arr.offset(), arr.len()),
157+
}
158+
}
159+
141160
/// A candidate expression for creating a `RowFilter` contains the
142161
/// expression as well as data to estimate the cost of evaluating
143162
/// the resulting expression.

0 commit comments

Comments
 (0)