config options updated

nikhilsinhaparseable · nikhilsinhaparseable · commit 475d1a700cab · 2025-03-06T02:29:16.000-05:00
diff --git a/src/query/mod.rs b/src/query/mod.rs
@@ -91,30 +91,23 @@ impl Query {
         let mut config = SessionConfig::default()
             .with_parquet_pruning(true)
             .with_prefer_existing_sort(true)
-            .with_round_robin_repartition(true)
-            .with_batch_size(8192);
+            .with_information_schema(true)
+            .with_batch_size(1000000)
+            .with_coalesce_batches(true);
 
         // For more details refer https://datafusion.apache.org/user-guide/configs.html
 
-        // Reduce the number of rows read (if possible)
-        config.options_mut().execution.parquet.enable_page_index = true;
-
         // Pushdown filters allows DF to push the filters as far down in the plan as possible
         // and thus, reducing the number of rows decoded
         config.options_mut().execution.parquet.pushdown_filters = true;
 
         // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation
         config.options_mut().execution.parquet.reorder_filters = true;
-
-        // Enable StringViewArray
-        // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/
+        config.options_mut().execution.parquet.binary_as_string = true;
         config
             .options_mut()
             .execution
-            .parquet
-            .schema_force_view_types = true;
-
-        config.options_mut().execution.parquet.binary_as_string = true;
+            .use_row_number_estimates_to_optimize_partitioning = true;
 
         let state = SessionStateBuilder::new()
             .with_default_features()
@@ -149,7 +142,9 @@ impl Query {
             .execute_logical_plan(self.final_logical_plan(&time_partition))
             .await?;
 
-        let fields = df
+        let optimised_df = df.repartition(Partitioning::RoundRobinBatch(16))?;
+
+        let fields = optimised_df
             .schema()
             .fields()
             .iter()
@@ -161,7 +156,7 @@ impl Query {
             return Ok((vec![], fields));
         }
 
-        let results = df.collect().await?;
+        let results = optimised_df.collect().await?;
         Ok((results, fields))
     }