@@ -23,17 +23,28 @@ use anyhow::anyhow;
23
23
use arrow_array:: RecordBatch ;
24
24
use arrow_json:: reader:: { infer_json_schema_from_iterator, ReaderBuilder } ;
25
25
use arrow_schema:: { DataType , Field , Fields , Schema } ;
26
+ use chrono:: { DateTime , NaiveDateTime , Utc } ;
26
27
use datafusion:: arrow:: util:: bit_util:: round_upto_multiple_of_64;
27
28
use itertools:: Itertools ;
28
29
use serde_json:: Value ;
29
30
use std:: { collections:: HashMap , sync:: Arc } ;
30
31
use tracing:: error;
31
32
32
33
use super :: EventFormat ;
33
- use crate :: { metadata:: SchemaVersion , utils:: arrow:: get_field} ;
34
+ use crate :: { metadata:: SchemaVersion , storage :: StreamType , utils:: arrow:: get_field} ;
34
35
35
36
pub struct Event {
36
37
pub json : Value ,
38
+ ingestion_time : DateTime < Utc > ,
39
+ }
40
+
41
+ impl Event {
42
+ pub fn new ( json : Value ) -> Self {
43
+ Self {
44
+ json,
45
+ ingestion_time : Utc :: now ( ) ,
46
+ }
47
+ }
37
48
}
38
49
39
50
impl EventFormat for Event {
@@ -120,6 +131,82 @@ impl EventFormat for Event {
120
131
Ok ( None ) => unreachable ! ( "all records are added to one rb" ) ,
121
132
}
122
133
}
134
+
135
+ fn into_event (
136
+ self ,
137
+ stream_name : String ,
138
+ origin_size : u64 ,
139
+ storage_schema : & HashMap < String , Arc < Field > > ,
140
+ static_schema_flag : bool ,
141
+ custom_partitions : Option < & String > ,
142
+ time_partition : Option < & String > ,
143
+ schema_version : SchemaVersion ,
144
+ stream_type : StreamType ,
145
+ ) -> Result < super :: Event , anyhow:: Error > {
146
+ let custom_partition_values = match custom_partitions. as_ref ( ) {
147
+ Some ( custom_partition) => {
148
+ let custom_partitions = custom_partition. split ( ',' ) . collect_vec ( ) ;
149
+ get_custom_partition_values ( & self . json , & custom_partitions)
150
+ }
151
+ None => HashMap :: new ( ) ,
152
+ } ;
153
+
154
+ let parsed_timestamp = match time_partition {
155
+ Some ( time_partition) => get_parsed_timestamp ( & self . json , time_partition) ?,
156
+ _ => self . ingestion_time . naive_utc ( ) ,
157
+ } ;
158
+
159
+ let ( rb, is_first_event) = self . into_recordbatch (
160
+ storage_schema,
161
+ static_schema_flag,
162
+ time_partition,
163
+ schema_version,
164
+ ) ?;
165
+
166
+ Ok ( super :: Event {
167
+ rb,
168
+ stream_name,
169
+ origin_format : "json" ,
170
+ origin_size,
171
+ is_first_event,
172
+ parsed_timestamp,
173
+ time_partition : None ,
174
+ custom_partition_values,
175
+ stream_type,
176
+ } )
177
+ }
178
+ }
179
+
180
+ pub fn get_custom_partition_values (
181
+ json : & Value ,
182
+ custom_partition_list : & [ & str ] ,
183
+ ) -> HashMap < String , String > {
184
+ let mut custom_partition_values: HashMap < String , String > = HashMap :: new ( ) ;
185
+ for custom_partition_field in custom_partition_list {
186
+ let custom_partition_value = json. get ( custom_partition_field. trim ( ) ) . unwrap ( ) . to_owned ( ) ;
187
+ let custom_partition_value = match custom_partition_value {
188
+ e @ Value :: Number ( _) | e @ Value :: Bool ( _) => e. to_string ( ) ,
189
+ Value :: String ( s) => s,
190
+ _ => "" . to_string ( ) ,
191
+ } ;
192
+ custom_partition_values. insert (
193
+ custom_partition_field. trim ( ) . to_string ( ) ,
194
+ custom_partition_value,
195
+ ) ;
196
+ }
197
+ custom_partition_values
198
+ }
199
+
200
+ fn get_parsed_timestamp (
201
+ json : & Value ,
202
+ time_partition : & str ,
203
+ ) -> Result < NaiveDateTime , anyhow:: Error > {
204
+ let current_time = json
205
+ . get ( time_partition)
206
+ . ok_or_else ( || anyhow ! ( "Missing field for time partition in json: {time_partition}" ) ) ?;
207
+ let parsed_time: DateTime < Utc > = serde_json:: from_value ( current_time. clone ( ) ) ?;
208
+
209
+ Ok ( parsed_time. naive_utc ( ) )
123
210
}
124
211
125
212
// Returns arrow schema with the fields that are present in the request body
@@ -225,3 +312,37 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion
225
312
}
226
313
}
227
314
}
315
+
316
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use serde_json::json;

    use super::*;

    /// A well-formed timestamp in the time partition field is parsed into a naive UTC time.
    #[test]
    fn parse_time_partition_from_value() {
        let json = json!({ "timestamp": "2025-05-15T15:30:00Z" });
        let parsed = get_parsed_timestamp(&json, "timestamp");

        let expected = NaiveDateTime::from_str("2025-05-15T15:30:00").unwrap();
        assert_eq!(parsed.unwrap(), expected);
    }

    /// The payload must not contain the time partition field for this case to be exercised.
    #[test]
    fn time_partition_not_in_json() {
        let json = json!({ "hello": "world!" });
        let parsed = get_parsed_timestamp(&json, "timestamp");

        assert!(parsed.is_err());
    }

    /// The field is present, but its value is not a valid date-time string.
    #[test]
    fn time_partition_not_parseable_as_datetime() {
        let json = json!({ "timestamp": "not time" });
        let parsed = get_parsed_timestamp(&json, "timestamp");

        assert!(parsed.is_err());
    }
}
0 commit comments