Skip to content

Commit ecc04d4

Browse files
authored
feat: Support faster multi-column grouping (`GroupColumn`) for `Date/Time/Timestamp` types (#13457)
* feat: Add `GroupColumn` for `Date/Time/Timestamp`
* Add tests
1 parent 8ce4da6 commit ecc04d4

File tree

3 files changed

+263
-3
lines changed

3 files changed

+263
-3
lines changed

datafusion/physical-plan/src/aggregates/group_values/mod.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@
1818
//! [`GroupValues`] trait for storing and interning group keys
1919
2020
use arrow::record_batch::RecordBatch;
21+
use arrow_array::types::{
22+
Date32Type, Date64Type, Time32MillisecondType, Time32SecondType,
23+
Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
24+
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
25+
};
2126
use arrow_array::{downcast_primitive, ArrayRef};
27+
use arrow_schema::TimeUnit;
2228
use arrow_schema::{DataType, SchemaRef};
2329
use datafusion_common::Result;
2430

@@ -142,6 +148,28 @@ pub(crate) fn new_group_values(
142148
}
143149

144150
match d {
151+
DataType::Date32 => {
152+
downcast_helper!(Date32Type, d);
153+
}
154+
DataType::Date64 => {
155+
downcast_helper!(Date64Type, d);
156+
}
157+
DataType::Time32(t) => match t {
158+
TimeUnit::Second => downcast_helper!(Time32SecondType, d),
159+
TimeUnit::Millisecond => downcast_helper!(Time32MillisecondType, d),
160+
_ => {}
161+
},
162+
DataType::Time64(t) => match t {
163+
TimeUnit::Microsecond => downcast_helper!(Time64MicrosecondType, d),
164+
TimeUnit::Nanosecond => downcast_helper!(Time64NanosecondType, d),
165+
_ => {}
166+
},
167+
DataType::Timestamp(t, _) => match t {
168+
TimeUnit::Second => downcast_helper!(TimestampSecondType, d),
169+
TimeUnit::Millisecond => downcast_helper!(TimestampMillisecondType, d),
170+
TimeUnit::Microsecond => downcast_helper!(TimestampMicrosecondType, d),
171+
TimeUnit::Nanosecond => downcast_helper!(TimestampNanosecondType, d),
172+
},
145173
DataType::Utf8 => {
146174
return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Utf8)));
147175
}

datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,14 @@ use ahash::RandomState;
3232
use arrow::compute::cast;
3333
use arrow::datatypes::{
3434
BinaryViewType, Date32Type, Date64Type, Float32Type, Float64Type, Int16Type,
35-
Int32Type, Int64Type, Int8Type, StringViewType, UInt16Type, UInt32Type, UInt64Type,
36-
UInt8Type,
35+
Int32Type, Int64Type, Int8Type, StringViewType, Time32MillisecondType,
36+
Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
37+
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
38+
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
3739
};
3840
use arrow::record_batch::RecordBatch;
3941
use arrow_array::{Array, ArrayRef};
40-
use arrow_schema::{DataType, Schema, SchemaRef};
42+
use arrow_schema::{DataType, Schema, SchemaRef, TimeUnit};
4143
use datafusion_common::hash_utils::create_hashes;
4244
use datafusion_common::{not_impl_err, DataFusionError, Result};
4345
use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt};
@@ -913,6 +915,38 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
913915
}
914916
&DataType::Date32 => instantiate_primitive!(v, nullable, Date32Type),
915917
&DataType::Date64 => instantiate_primitive!(v, nullable, Date64Type),
918+
&DataType::Time32(t) => match t {
919+
TimeUnit::Second => {
920+
instantiate_primitive!(v, nullable, Time32SecondType)
921+
}
922+
TimeUnit::Millisecond => {
923+
instantiate_primitive!(v, nullable, Time32MillisecondType)
924+
}
925+
_ => {}
926+
},
927+
&DataType::Time64(t) => match t {
928+
TimeUnit::Microsecond => {
929+
instantiate_primitive!(v, nullable, Time64MicrosecondType)
930+
}
931+
TimeUnit::Nanosecond => {
932+
instantiate_primitive!(v, nullable, Time64NanosecondType)
933+
}
934+
_ => {}
935+
},
936+
&DataType::Timestamp(t, _) => match t {
937+
TimeUnit::Second => {
938+
instantiate_primitive!(v, nullable, TimestampSecondType)
939+
}
940+
TimeUnit::Millisecond => {
941+
instantiate_primitive!(v, nullable, TimestampMillisecondType)
942+
}
943+
TimeUnit::Microsecond => {
944+
instantiate_primitive!(v, nullable, TimestampMicrosecondType)
945+
}
946+
TimeUnit::Nanosecond => {
947+
instantiate_primitive!(v, nullable, TimestampNanosecondType)
948+
}
949+
},
916950
&DataType::Utf8 => {
917951
let b = ByteGroupValueBuilder::<i32>::new(OutputType::Utf8);
918952
v.push(Box::new(b) as _)
@@ -1125,6 +1159,8 @@ fn supported_type(data_type: &DataType) -> bool {
11251159
| DataType::LargeBinary
11261160
| DataType::Date32
11271161
| DataType::Date64
1162+
| DataType::Time32(_)
1163+
| DataType::Timestamp(_, _)
11281164
| DataType::Utf8View
11291165
| DataType::BinaryView
11301166
)

datafusion/sqllogictest/test_files/group_by.slt

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5272,6 +5272,201 @@ drop view t
52725272
statement ok
52735273
drop table source;
52745274

5275+
# Test multi group by int + Date32
5276+
statement ok
5277+
create table source as values
5278+
(1, '2020-01-01'),
5279+
(1, '2020-01-01'),
5280+
(2, '2020-01-02'),
5281+
(2, '2020-01-03'),
5282+
(3, '2020-01-04'),
5283+
(3, '2020-01-04'),
5284+
(2, '2020-01-03'),
5285+
(null, null),
5286+
(null, '2020-01-01'),
5287+
(null, null),
5288+
(null, '2020-01-01'),
5289+
(2, '2020-01-02'),
5290+
(2, '2020-01-02'),
5291+
(1, null)
5292+
;
5293+
5294+
statement ok
5295+
create view t as select column1 as a, arrow_cast(column2, 'Date32') as b from source;
5296+
5297+
query IDI
5298+
select a, b, count(*) from t group by a, b order by a, b;
5299+
----
5300+
1 2020-01-01 2
5301+
1 NULL 1
5302+
2 2020-01-02 3
5303+
2 2020-01-03 2
5304+
3 2020-01-04 2
5305+
NULL 2020-01-01 2
5306+
NULL NULL 2
5307+
5308+
statement ok
5309+
drop view t
5310+
5311+
statement ok
5312+
drop table source;
5313+
5314+
# Test multi group by int + Date64
5315+
statement ok
5316+
create table source as values
5317+
(1, '2020-01-01'),
5318+
(1, '2020-01-01'),
5319+
(2, '2020-01-02'),
5320+
(2, '2020-01-03'),
5321+
(3, '2020-01-04'),
5322+
(3, '2020-01-04'),
5323+
(2, '2020-01-03'),
5324+
(null, null),
5325+
(null, '2020-01-01'),
5326+
(null, null),
5327+
(null, '2020-01-01'),
5328+
(2, '2020-01-02'),
5329+
(2, '2020-01-02'),
5330+
(1, null)
5331+
;
5332+
5333+
statement ok
5334+
create view t as select column1 as a, arrow_cast(column2, 'Date64') as b from source;
5335+
5336+
query IDI
5337+
select a, b, count(*) from t group by a, b order by a, b;
5338+
----
5339+
1 2020-01-01T00:00:00 2
5340+
1 NULL 1
5341+
2 2020-01-02T00:00:00 3
5342+
2 2020-01-03T00:00:00 2
5343+
3 2020-01-04T00:00:00 2
5344+
NULL 2020-01-01T00:00:00 2
5345+
NULL NULL 2
5346+
5347+
statement ok
5348+
drop view t
5349+
5350+
statement ok
5351+
drop table source;
5352+
5353+
# Test multi group by int + Time32
5354+
statement ok
5355+
create table source as values
5356+
(1, '12:34:56'),
5357+
(1, '12:34:56'),
5358+
(2, '13:00:00'),
5359+
(2, '14:15:00'),
5360+
(3, '23:59:59'),
5361+
(3, '23:59:59'),
5362+
(2, '14:15:00'),
5363+
(null, null),
5364+
(null, '12:00:00'),
5365+
(null, null),
5366+
(null, '12:00:00'),
5367+
(2, '13:00:00'),
5368+
(2, '13:00:00'),
5369+
(1, null)
5370+
;
5371+
5372+
statement ok
5373+
create view t as select column1 as a, arrow_cast(column2, 'Time32(Second)') as b from source;
5374+
5375+
query IDI
5376+
select a, b, count(*) from t group by a, b order by a, b;
5377+
----
5378+
1 12:34:56 2
5379+
1 NULL 1
5380+
2 13:00:00 3
5381+
2 14:15:00 2
5382+
3 23:59:59 2
5383+
NULL 12:00:00 2
5384+
NULL NULL 2
5385+
5386+
statement ok
5387+
drop view t
5388+
5389+
statement ok
5390+
drop table source;
5391+
5392+
# Test multi group by int + Time64
5393+
statement ok
5394+
create table source as values
5395+
(1, '12:34:56.123456'),
5396+
(1, '12:34:56.123456'),
5397+
(2, '13:00:00.000001'),
5398+
(2, '14:15:00.999999'),
5399+
(3, '23:59:59.500000'),
5400+
(3, '23:59:59.500000'),
5401+
(2, '14:15:00.999999'),
5402+
(null, null),
5403+
(null, '12:00:00.000000'),
5404+
(null, null),
5405+
(null, '12:00:00.000000'),
5406+
(2, '13:00:00.000001'),
5407+
(2, '13:00:00.000001'),
5408+
(1, null)
5409+
;
5410+
5411+
statement ok
5412+
create view t as select column1 as a, arrow_cast(column2, 'Time64(Microsecond)') as b from source;
5413+
5414+
query IDI
5415+
select a, b, count(*) from t group by a, b order by a, b;
5416+
----
5417+
1 12:34:56.123456 2
5418+
1 NULL 1
5419+
2 13:00:00.000001 3
5420+
2 14:15:00.999999 2
5421+
3 23:59:59.500 2
5422+
NULL 12:00:00 2
5423+
NULL NULL 2
5424+
5425+
statement ok
5426+
drop view t
5427+
5428+
statement ok
5429+
drop table source;
5430+
5431+
# Test multi group by int + Timestamp
5432+
statement ok
5433+
create table source as values
5434+
(1, '2020-01-01 12:34:56'),
5435+
(1, '2020-01-01 12:34:56'),
5436+
(2, '2020-01-02 13:00:00'),
5437+
(2, '2020-01-03 14:15:00'),
5438+
(3, '2020-01-04 23:59:59'),
5439+
(3, '2020-01-04 23:59:59'),
5440+
(2, '2020-01-03 14:15:00'),
5441+
(null, null),
5442+
(null, '2020-01-01 12:00:00'),
5443+
(null, null),
5444+
(null, '2020-01-01 12:00:00'),
5445+
(2, '2020-01-02 13:00:00'),
5446+
(2, '2020-01-02 13:00:00'),
5447+
(1, null)
5448+
;
5449+
5450+
statement ok
5451+
create view t as select column1 as a, arrow_cast(column2, 'Timestamp(Nanosecond, None)') as b from source;
5452+
5453+
query IPI
5454+
select a, b, count(*) from t group by a, b order by a, b;
5455+
----
5456+
1 2020-01-01T12:34:56 2
5457+
1 NULL 1
5458+
2 2020-01-02T13:00:00 3
5459+
2 2020-01-03T14:15:00 2
5460+
3 2020-01-04T23:59:59 2
5461+
NULL 2020-01-01T12:00:00 2
5462+
NULL NULL 2
5463+
5464+
statement ok
5465+
drop view t
5466+
5467+
statement ok
5468+
drop table source;
5469+
52755470
# Test whether min, max accumulator produces NaN result when input is NaN.
52765471
# See https://github.com/apache/datafusion/issues/13415 for rationale
52775472
statement ok
@@ -5287,3 +5482,4 @@ query RR
52875482
SELECT max(input_table.x), min(input_table.x) from input_table GROUP BY input_table."row";
52885483
----
52895484
NaN NaN
5485+

0 commit comments

Comments
 (0)