@@ -29,10 +29,15 @@ use datafusion::execution::context::SessionState;
29
29
use datafusion:: execution:: disk_manager:: DiskManagerConfig ;
30
30
use datafusion:: execution:: runtime_env:: RuntimeEnv ;
31
31
use datafusion:: prelude:: * ;
32
+ use futures_util:: stream:: FuturesUnordered ;
33
+ use futures_util:: { future, Future , TryStreamExt } ;
32
34
use itertools:: Itertools ;
35
+ use object_store:: path:: Path as StorePath ;
36
+ use object_store:: { ObjectMeta , ObjectStore } ;
33
37
use serde_json:: Value ;
34
38
use std:: collections:: HashMap ;
35
39
use std:: path:: { Path , PathBuf } ;
40
+ use std:: pin:: Pin ;
36
41
use std:: sync:: Arc ;
37
42
use sysinfo:: { System , SystemExt } ;
38
43
@@ -76,16 +81,14 @@ impl Query {
76
81
}
77
82
78
83
/// Return prefixes, each per day/hour/minutes as necessary
79
- fn _get_prefixes ( & self ) -> Vec < String > {
84
+ fn generate_prefixes ( & self ) -> Vec < String > {
80
85
TimePeriod :: new ( self . start , self . end , OBJECT_STORE_DATA_GRANULARITY ) . generate_prefixes ( )
81
86
}
82
87
83
- pub fn get_prefixes ( & self ) -> Vec < String > {
84
- self . _get_prefixes ( )
88
+ fn get_prefixes ( & self ) -> Vec < String > {
89
+ self . generate_prefixes ( )
85
90
. into_iter ( )
86
91
. map ( |key| format ! ( "{}/{}" , self . stream_name, key) )
87
- // latest first
88
- . rev ( )
89
92
. collect ( )
90
93
}
91
94
@@ -129,7 +132,15 @@ impl Query {
129
132
storage : Arc < dyn ObjectStorage + Send > ,
130
133
) -> Result < ( Vec < RecordBatch > , Vec < String > ) , ExecuteError > {
131
134
let ctx = self . create_session_context ( ) ;
132
- let remote_listing_table = self . _remote_query ( storage) ?;
135
+ let unresolved_prefixes = self . get_prefixes ( ) ;
136
+ let client = ctx
137
+ . runtime_env ( )
138
+ . object_store ( Box :: new ( storage. store_url ( ) ) )
139
+ . unwrap ( ) ;
140
+ let prefixes =
141
+ resolve_paths ( client, storage. normalize_prefixes ( unresolved_prefixes) ) . await ?;
142
+
143
+ let remote_listing_table = self . remote_query ( prefixes, storage) ?;
133
144
134
145
let current_minute = Utc :: now ( )
135
146
. with_second ( 0 )
@@ -164,11 +175,12 @@ impl Query {
164
175
Ok ( ( results, fields) )
165
176
}
166
177
167
- fn _remote_query (
178
+ fn remote_query (
168
179
& self ,
180
+ prefixes : Vec < String > ,
169
181
storage : Arc < dyn ObjectStorage + Send > ,
170
182
) -> Result < Option < Arc < ListingTable > > , ExecuteError > {
171
- let prefixes = storage. query_prefixes ( self . get_prefixes ( ) ) ;
183
+ let prefixes = storage. query_prefixes ( prefixes ) ;
172
184
if prefixes. is_empty ( ) {
173
185
return Ok ( None ) ;
174
186
}
@@ -231,6 +243,83 @@ fn time_from_path(path: &Path) -> DateTime<Utc> {
231
243
. unwrap ( )
232
244
}
233
245
246
+ // accepts relative paths to resolve the narrative
247
+ // returns list of prefixes sorted in descending order
248
+ async fn resolve_paths (
249
+ client : Arc < dyn ObjectStore > ,
250
+ prefixes : Vec < String > ,
251
+ ) -> Result < Vec < String > , ObjectStorageError > {
252
+ let mut minute_resolve: HashMap < String , Vec < String > > = HashMap :: new ( ) ;
253
+ let mut all_resolve = Vec :: new ( ) ;
254
+
255
+ for prefix in prefixes {
256
+ let components = prefix. split_terminator ( '/' ) ;
257
+ if components. last ( ) . is_some_and ( |x| x. starts_with ( "minute" ) ) {
258
+ let hour_prefix = & prefix[ 0 ..prefix. rfind ( "minute" ) . expect ( "minute exists" ) ] ;
259
+ minute_resolve
260
+ . entry ( hour_prefix. to_owned ( ) )
261
+ . and_modify ( |list| list. push ( prefix) )
262
+ . or_default ( ) ;
263
+ } else {
264
+ all_resolve. push ( prefix)
265
+ }
266
+ }
267
+
268
+ type ResolveFuture = Pin < Box < dyn Future < Output = Result < Vec < ObjectMeta > , ObjectStorageError > > > > ;
269
+
270
+ let tasks: FuturesUnordered < ResolveFuture > = FuturesUnordered :: new ( ) ;
271
+
272
+ for ( listing_prefix, prefix) in minute_resolve {
273
+ let client = Arc :: clone ( & client) ;
274
+ tasks. push ( Box :: pin ( async move {
275
+ let mut list = client
276
+ . list ( Some ( & StorePath :: from ( listing_prefix) ) )
277
+ . await ?
278
+ . try_collect :: < Vec < _ > > ( )
279
+ . await ?;
280
+
281
+ list. retain ( |object| {
282
+ prefix. iter ( ) . any ( |prefix| {
283
+ object
284
+ . location
285
+ . prefix_matches ( & StorePath :: from ( prefix. as_ref ( ) ) )
286
+ } )
287
+ } ) ;
288
+
289
+ Ok ( list)
290
+ } ) ) ;
291
+ }
292
+
293
+ for prefix in all_resolve {
294
+ let client = Arc :: clone ( & client) ;
295
+ tasks. push ( Box :: pin ( async move {
296
+ client
297
+ . list ( Some ( & StorePath :: from ( prefix) ) )
298
+ . await ?
299
+ . try_collect :: < Vec < _ > > ( )
300
+ . await
301
+ . map_err ( Into :: into)
302
+ } ) ) ;
303
+ }
304
+
305
+ let res: Vec < Vec < String > > = tasks
306
+ . and_then ( |res| {
307
+ future:: ok (
308
+ res. into_iter ( )
309
+ . map ( |res| res. location . to_string ( ) )
310
+ . collect_vec ( ) ,
311
+ )
312
+ } )
313
+ . try_collect ( )
314
+ . await ?;
315
+
316
+ let mut res = res. into_iter ( ) . flatten ( ) . collect_vec ( ) ;
317
+ res. sort ( ) ;
318
+ res. reverse ( ) ;
319
+
320
+ Ok ( res)
321
+ }
322
+
234
323
pub mod error {
235
324
use datafusion:: error:: DataFusionError ;
236
325
0 commit comments