Skip to content

Commit 7e69580

Browse files
authored
Add support for Utf8View to crypto functions #13406 (#13407)
1 parent d840e98 commit 7e69580

File tree

9 files changed

+116
-19
lines changed

9 files changed

+116
-19
lines changed

datafusion/functions/src/crypto/basic.rs

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,18 @@
1717

1818
//! "crypto" DataFusion functions
1919
20-
use arrow::array::StringArray;
2120
use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait};
21+
use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
2222
use arrow::datatypes::DataType;
2323
use blake2::{Blake2b512, Blake2s256, Digest};
2424
use blake3::Hasher as Blake3;
2525
use datafusion_common::cast::as_binary_array;
2626

27+
use arrow::compute::StringArrayType;
2728
use datafusion_common::plan_err;
2829
use datafusion_common::{
29-
cast::{as_generic_binary_array, as_generic_string_array},
30-
exec_err, internal_err, DataFusionError, Result, ScalarValue,
30+
cast::as_generic_binary_array, exec_err, internal_err, DataFusionError, Result,
31+
ScalarValue,
3132
};
3233
use datafusion_expr::ColumnarValue;
3334
use md5::Md5;
@@ -121,9 +122,9 @@ pub fn digest(args: &[ColumnarValue]) -> Result<ColumnarValue> {
121122
}
122123
let digest_algorithm = match &args[1] {
123124
ColumnarValue::Scalar(scalar) => match scalar {
124-
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
125-
method.parse::<DigestAlgorithm>()
126-
}
125+
ScalarValue::Utf8View(Some(method))
126+
| ScalarValue::Utf8(Some(method))
127+
| ScalarValue::LargeUtf8(Some(method)) => method.parse::<DigestAlgorithm>(),
127128
other => exec_err!("Unsupported data type {other:?} for function digest"),
128129
},
129130
ColumnarValue::Array(_) => {
@@ -132,6 +133,7 @@ pub fn digest(args: &[ColumnarValue]) -> Result<ColumnarValue> {
132133
}?;
133134
digest_process(&args[0], digest_algorithm)
134135
}
136+
135137
impl FromStr for DigestAlgorithm {
136138
type Err = DataFusionError;
137139
fn from_str(name: &str) -> Result<DigestAlgorithm> {
@@ -166,12 +168,14 @@ impl FromStr for DigestAlgorithm {
166168
})
167169
}
168170
}
171+
169172
impl fmt::Display for DigestAlgorithm {
170173
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
171174
write!(f, "{}", format!("{self:?}").to_lowercase())
172175
}
173176
}
174-
// /// computes md5 hash digest of the given input
177+
178+
/// Computes the MD5 hash digest of the given input.
175179
pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
176180
if args.len() != 1 {
177181
return exec_err!(
@@ -180,7 +184,9 @@ pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
180184
DigestAlgorithm::Md5
181185
);
182186
}
187+
183188
let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
189+
184190
// md5 requires special handling because of its unique utf8 return type
185191
Ok(match value {
186192
ColumnarValue::Array(array) => {
@@ -214,7 +220,8 @@ pub fn utf8_or_binary_to_binary_type(
214220
name: &str,
215221
) -> Result<DataType> {
216222
Ok(match arg_type {
217-
DataType::LargeUtf8
223+
DataType::Utf8View
224+
| DataType::LargeUtf8
218225
| DataType::Utf8
219226
| DataType::Binary
220227
| DataType::LargeBinary => DataType::Binary,
@@ -296,8 +303,30 @@ impl DigestAlgorithm {
296303
where
297304
T: OffsetSizeTrait,
298305
{
299-
let input_value = as_generic_string_array::<T>(value)?;
300-
let array: ArrayRef = match self {
306+
let array = match value.data_type() {
307+
DataType::Utf8 | DataType::LargeUtf8 => {
308+
let v = value.as_string::<T>();
309+
self.digest_utf8_array_impl::<&GenericStringArray<T>>(v)
310+
}
311+
DataType::Utf8View => {
312+
let v = value.as_string_view();
313+
self.digest_utf8_array_impl::<&StringViewArray>(v)
314+
}
315+
other => {
316+
return exec_err!("unsupported type for digest_utf_array: {other:?}")
317+
}
318+
};
319+
Ok(ColumnarValue::Array(array))
320+
}
321+
322+
pub fn digest_utf8_array_impl<'a, StringArrType>(
323+
self,
324+
input_value: StringArrType,
325+
) -> ArrayRef
326+
where
327+
StringArrType: StringArrayType<'a>,
328+
{
329+
match self {
301330
Self::Md5 => digest_to_array!(Md5, input_value),
302331
Self::Sha224 => digest_to_array!(Sha224, input_value),
303332
Self::Sha256 => digest_to_array!(Sha256, input_value),
@@ -318,8 +347,7 @@ impl DigestAlgorithm {
318347
.collect();
319348
Arc::new(binary_array)
320349
}
321-
};
322-
Ok(ColumnarValue::Array(array))
350+
}
323351
}
324352
}
325353
pub fn digest_process(
@@ -328,6 +356,7 @@ pub fn digest_process(
328356
) -> Result<ColumnarValue> {
329357
match value {
330358
ColumnarValue::Array(a) => match a.data_type() {
359+
DataType::Utf8View => digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
331360
DataType::Utf8 => digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
332361
DataType::LargeUtf8 => digest_algorithm.digest_utf8_array::<i64>(a.as_ref()),
333362
DataType::Binary => digest_algorithm.digest_binary_array::<i32>(a.as_ref()),
@@ -339,7 +368,9 @@ pub fn digest_process(
339368
),
340369
},
341370
ColumnarValue::Scalar(scalar) => match scalar {
342-
ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
371+
ScalarValue::Utf8View(a)
372+
| ScalarValue::Utf8(a)
373+
| ScalarValue::LargeUtf8(a) => {
343374
Ok(digest_algorithm
344375
.digest_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
345376
}

datafusion/functions/src/crypto/digest.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ impl DigestFunc {
4242
Self {
4343
signature: Signature::one_of(
4444
vec![
45+
Exact(vec![Utf8View, Utf8View]),
4546
Exact(vec![Utf8, Utf8]),
4647
Exact(vec![LargeUtf8, Utf8]),
4748
Exact(vec![Binary, Utf8]),

datafusion/functions/src/crypto/md5.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl Md5Func {
4242
Self {
4343
signature: Signature::uniform(
4444
1,
45-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
45+
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
4646
Volatility::Immutable,
4747
),
4848
}
@@ -65,7 +65,7 @@ impl ScalarUDFImpl for Md5Func {
6565
use DataType::*;
6666
Ok(match &arg_types[0] {
6767
LargeUtf8 | LargeBinary => LargeUtf8,
68-
Utf8 | Binary => Utf8,
68+
Utf8View | Utf8 | Binary => Utf8,
6969
Null => Null,
7070
Dictionary(_, t) => match **t {
7171
LargeUtf8 | LargeBinary => LargeUtf8,

datafusion/functions/src/crypto/sha224.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ impl SHA224Func {
4343
Self {
4444
signature: Signature::uniform(
4545
1,
46-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
46+
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
4747
Volatility::Immutable,
4848
),
4949
}

datafusion/functions/src/crypto/sha256.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl SHA256Func {
4242
Self {
4343
signature: Signature::uniform(
4444
1,
45-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
45+
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
4646
Volatility::Immutable,
4747
),
4848
}

datafusion/functions/src/crypto/sha384.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl SHA384Func {
4242
Self {
4343
signature: Signature::uniform(
4444
1,
45-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
45+
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
4646
Volatility::Immutable,
4747
),
4848
}

datafusion/functions/src/crypto/sha512.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl SHA512Func {
4242
Self {
4343
signature: Signature::uniform(
4444
1,
45-
vec![Utf8, LargeUtf8, Binary, LargeBinary],
45+
vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
4646
Volatility::Immutable,
4747
),
4848
}

datafusion/sqllogictest/test_files/expr.slt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2225,6 +2225,11 @@ SELECT digest('','blake3');
22252225
----
22262226
af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262
22272227

2228+
# verify utf8view
2229+
query ?
2230+
SELECT sha224(arrow_cast('tom', 'Utf8View'));
2231+
----
2232+
0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d
22282233

22292234
query T
22302235
SELECT substring('alphabet', 1)

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,66 @@ logical_plan
963963
01)Projection: nullif(test.column1_utf8view, test.column1_utf8view) AS c
964964
02)--TableScan: test projection=[column1_utf8view]
965965

966+
## Ensure no casts for md5
967+
query TT
968+
EXPLAIN SELECT
969+
md5(column1_utf8view) as c
970+
FROM test;
971+
----
972+
logical_plan
973+
01)Projection: md5(test.column1_utf8view) AS c
974+
02)--TableScan: test projection=[column1_utf8view]
975+
976+
## Ensure no casts for sha224
977+
query TT
978+
EXPLAIN SELECT
979+
sha224(column1_utf8view) as c
980+
FROM test;
981+
----
982+
logical_plan
983+
01)Projection: sha224(test.column1_utf8view) AS c
984+
02)--TableScan: test projection=[column1_utf8view]
985+
986+
## Ensure no casts for sha256
987+
query TT
988+
EXPLAIN SELECT
989+
sha256(column1_utf8view) as c
990+
FROM test;
991+
----
992+
logical_plan
993+
01)Projection: sha256(test.column1_utf8view) AS c
994+
02)--TableScan: test projection=[column1_utf8view]
995+
996+
## Ensure no casts for sha384
997+
query TT
998+
EXPLAIN SELECT
999+
sha384(column1_utf8view) as c
1000+
FROM test;
1001+
----
1002+
logical_plan
1003+
01)Projection: sha384(test.column1_utf8view) AS c
1004+
02)--TableScan: test projection=[column1_utf8view]
1005+
1006+
## Ensure no casts for sha512
1007+
query TT
1008+
EXPLAIN SELECT
1009+
sha512(column1_utf8view) as c
1010+
FROM test;
1011+
----
1012+
logical_plan
1013+
01)Projection: sha512(test.column1_utf8view) AS c
1014+
02)--TableScan: test projection=[column1_utf8view]
1015+
1016+
## Ensure no casts for digest
1017+
query TT
1018+
EXPLAIN SELECT
1019+
digest(column1_utf8view, 'md5') as c
1020+
FROM test;
1021+
----
1022+
logical_plan
1023+
01)Projection: digest(test.column1_utf8view, Utf8View("md5")) AS c
1024+
02)--TableScan: test projection=[column1_utf8view]
1025+
9661026
## Ensure no casts for binary operators
9671027
# `~` operator (regex match)
9681028
query TT

0 commit comments

Comments
 (0)