Skip to content

Commit 4389cf9

Browse files
authored
fix: check overflow numbers while inferring type for csv files (#6481)
* refactor: detect overflow for type inference
* chore: fallback to utf8 and tests
1 parent 581c647 commit 4389cf9

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

arrow-csv/src/reader/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,12 @@ impl InferredDataType {
215215
self.packed |= if string.starts_with('"') {
216216
1 << 8 // Utf8
217217
} else if let Some(m) = REGEX_SET.matches(string).into_iter().next() {
218-
1 << m
218+
if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() {
219+
// the value matched the integer pattern but overflows i64, so fall back to Utf8
220+
1 << 8
221+
} else {
222+
1 << m
223+
}
219224
} else {
220225
1 << 8 // Utf8
221226
}
@@ -1819,6 +1824,8 @@ mod tests {
18191824
infer_field_schema("2021-12-19T13:12:30.123456789"),
18201825
DataType::Timestamp(TimeUnit::Nanosecond, None)
18211826
);
1827+
assert_eq!(infer_field_schema("-9223372036854775809"), DataType::Utf8);
1828+
assert_eq!(infer_field_schema("9223372036854775808"), DataType::Utf8);
18221829
}
18231830

18241831
#[test]

0 commit comments

Comments
 (0)