@@ -16,24 +16,13 @@
 *
 *
 */
-
- use std::{
-     collections::{HashMap, HashSet},
-     fs::{remove_file, write, File, OpenOptions},
-     num::NonZeroU32,
-     path::{Path, PathBuf},
-     process,
-     sync::{Arc, Mutex, RwLock},
-     time::{Instant, SystemTime, UNIX_EPOCH},
- };
-
- use arrow_array::{Array, Float64Array, Int64Array, NullArray, StringArray};
+ use arrow_array::{Array, Date32Array, Float64Array, Int64Array, NullArray, StringArray};
use arrow_array::{BooleanArray, RecordBatch, TimestampMillisecondArray};
use arrow_schema::{Field, Fields, Schema};
use chrono::{DateTime, NaiveDateTime, Timelike, Utc};
use datafusion::{datasource::MemTable, prelude::SessionContext};
use derive_more::{Deref, DerefMut};
- use futures::stream::{FuturesUnordered, StreamExt};
+ use futures_util::StreamExt;
use itertools::Itertools;
use parquet::{
    arrow::ArrowWriter,
@@ -45,6 +34,15 @@ use parquet::{
use rand::distributions::DistString;
use relative_path::RelativePathBuf;
use serde::Serialize;
+ use std::{
+     collections::{HashMap, HashSet},
+     fs::{remove_file, write, File, OpenOptions},
+     num::NonZeroU32,
+     path::{Path, PathBuf},
+     process,
+     sync::{Arc, Mutex, RwLock},
+     time::{Instant, SystemTime, UNIX_EPOCH},
+ };
use tokio::task::JoinSet;
use tracing::{error, info, trace, warn};

@@ -76,6 +74,8 @@ use super::{
    LogStream, ARROW_FILE_EXTENSION,
};

+ const MAX_CONCURRENT_FIELD_STATS: usize = 10;
+
#[derive(Serialize, Debug)]
struct DistinctStat {
    distinct_value: String,
@@ -355,7 +355,7 @@ impl Stream {
        // if yes, then merge them and save

        if let Some(mut schema) = schema {
-             if !&self.stream_name.contains(INTERNAL_STREAM_NAME) {
+             if self.get_stream_type() != StreamType::Internal {
                if let Err(err) = self.calculate_field_stats(rbs, schema.clone().into()).await {
                    warn!(
                        "Error calculating field stats for stream {}: {}",
@@ -826,19 +826,25 @@ impl Stream {
        ctx: &SessionContext,
        schema: &Arc<Schema>,
    ) -> Vec<FieldStat> {
-         let field_futures = schema.fields().iter().map(|field| {
+         // Collect field names into an owned Vec<String> to avoid lifetime issues
+         let field_names: Vec<String> = schema
+             .fields()
+             .iter()
+             .map(|field| field.name().clone())
+             .collect();
+
+         let field_futures = field_names.into_iter().map(|field_name| {
            let ctx = ctx.clone();
            let stream_name = self.stream_name.clone();
-             let field_name = field.name().clone();
            async move { Self::calculate_single_field_stats(ctx, stream_name, field_name).await }
        });

-         FuturesUnordered::from_iter(field_futures)
+         futures::stream::iter(field_futures)
+             .buffer_unordered(MAX_CONCURRENT_FIELD_STATS)
            .filter_map(|x| async { x })
            .collect::<Vec<_>>()
            .await
    }
-
    async fn calculate_single_field_stats(
        ctx: SessionContext,
        stream_name: String,
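
A note on the concurrency change above: `FuturesUnordered::from_iter` polls every per-field future at once, whereas `futures::stream::iter(..).buffer_unordered(n)` keeps at most `n` futures in flight, so `MAX_CONCURRENT_FIELD_STATS` now caps how many stats queries hit DataFusion concurrently. A minimal standalone sketch of the same pattern, with a made-up `fetch_stat` job standing in for `calculate_single_field_stats`:

```rust
use futures::stream::{self, StreamExt};

// Hypothetical per-field job; stands in for calculate_single_field_stats.
async fn fetch_stat(field: String) -> Option<(String, usize)> {
    Some((field.clone(), field.len()))
}

#[tokio::main]
async fn main() {
    let fields = ["host", "status", "latency"].map(String::from);
    // At most 10 futures are polled concurrently, mirroring MAX_CONCURRENT_FIELD_STATS.
    let stats: Vec<(String, usize)> = stream::iter(fields.into_iter().map(fetch_stat))
        .buffer_unordered(10)
        .filter_map(|x| async { x })
        .collect()
        .await;
    println!("{stats:?}");
}
```
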
@@ -876,11 +882,12 @@ impl Stream {
    async fn query_single_i64(ctx: &SessionContext, sql: &str) -> Option<i64> {
        let df = ctx.sql(sql).await.ok()?;
        let batches = df.collect().await.ok()?;
-         let array = batches
-             .first()?
-             .column(0)
-             .as_any()
-             .downcast_ref::<arrow_array::Int64Array>()?;
+         let batch = batches.first()?;
+         if batch.num_rows() == 0 {
+             return None;
+         }
+         let array = batch.column(0).as_any().downcast_ref::<Int64Array>()?;
+
        Some(array.value(0))
    }

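Worth noting why the guard matters: `Int64Array::value(0)` asserts the index is in bounds and panics on an empty array, so an aggregate query returning zero rows would previously have crashed rather than yielding `None`. A tiny standalone illustration (arrow only):

```rust
use arrow_array::{Array, Int64Array};

fn main() {
    let empty = Int64Array::from(Vec::<i64>::new());
    // Without a length check, empty.value(0) panics with an out-of-bounds
    // assertion; checking first lets the caller fall back to None.
    let first = if empty.is_empty() { None } else { Some(empty.value(0)) };
    assert_eq!(first, None);
}
```
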
@@ -899,11 +906,17 @@ impl Stream {
            DateTime::from_timestamp_millis(timestamp)
                .map(|dt| dt.to_string())
                .unwrap_or_else(|| "INVALID_TIMESTAMP".to_string())
+         } else if let Some(arr) = array.as_any().downcast_ref::<Date32Array>() {
+             return arr.value(idx).to_string();
        } else if let Some(arr) = array.as_any().downcast_ref::<BooleanArray>() {
            arr.value(idx).to_string()
        } else if array.as_any().downcast_ref::<NullArray>().is_some() {
            "NULL".to_string()
        } else {
+             warn!(
+                 "Unsupported array type for statistics: {:?}",
+                 array.data_type()
+             );
            "UNSUPPORTED".to_string()
        }
    }
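
One caveat on the new `Date32Array` branch: `value(idx)` returns the raw `i32` day offset since the Unix epoch, so the stat renders as e.g. `20200` rather than a calendar date. If a human-readable date is wanted, chrono (already imported in this file) can convert it; a hypothetical helper, not part of this diff:

```rust
use chrono::NaiveDate;

// Assumption, not in the PR: render a Date32 value (days since 1970-01-01)
// as an ISO-8601 date instead of the raw day offset.
fn date32_to_string(days: i32) -> String {
    // 719_163 days lie between CE day 1 (0001-01-01) and 1970-01-01.
    NaiveDate::from_num_days_from_ce_opt(days + 719_163)
        .map(|d| d.to_string())
        .unwrap_or_else(|| "INVALID_DATE".to_string())
}

fn main() {
    assert_eq!(date32_to_string(0), "1970-01-01");
}
```
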
@@ -914,7 +927,8 @@ impl Stream {
        field_name: &str,
    ) -> Vec<DistinctStat> {
        let sql = format!(
-             "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" where \"{field_name}\" is not null group by \"{field_name}\" order by distinct_count desc limit 50"
+             "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" where \"{field_name}\" is not null group by \"{field_name}\" order by distinct_count desc limit {}",
+             PARSEABLE.options.max_field_statistics
        );
        let mut distinct_stats = Vec::new();
        if let Ok(df) = ctx.sql(&sql).await {
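
Since the distinct-values limit now comes from `PARSEABLE.options.max_field_statistics` instead of a hard-coded 50, here is a quick sketch of how the interpolated query comes out (field and stream names are made up, and a plain variable stands in for the config option):

```rust
fn main() {
    let field_name = "status";     // hypothetical field
    let stream_name = "app_logs";  // hypothetical stream
    let max_field_statistics = 50; // stands in for PARSEABLE.options.max_field_statistics
    let sql = format!(
        "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" \
         where \"{field_name}\" is not null group by \"{field_name}\" \
         order by distinct_count desc limit {max_field_statistics}"
    );
    assert!(sql.ends_with("limit 50"));
}
```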