Commit c8f5d91

Align schemas for DataFusion plan and stream (#829)
1 parent 982db0e

2 files changed: +24 −6 lines

crates/integrations/datafusion/src/physical_plan/scan.rs

Lines changed: 1 addition & 4 deletions
@@ -45,8 +45,6 @@ pub(crate) struct IcebergTableScan {
     table: Table,
     /// Snapshot of the table to scan.
     snapshot_id: Option<i64>,
-    /// A reference-counted arrow `Schema`.
-    schema: ArrowSchemaRef,
     /// Stores certain, often expensive to compute,
     /// plan properties used in query optimization.
     plan_properties: PlanProperties,
@@ -76,7 +74,6 @@ impl IcebergTableScan {
         Self {
             table,
             snapshot_id,
-            schema,
             plan_properties,
             projection,
             predicates,
@@ -134,7 +131,7 @@ impl ExecutionPlan for IcebergTableScan {
         let stream = futures::stream::once(fut).try_flatten();

         Ok(Box::pin(RecordBatchStreamAdapter::new(
-            self.schema.clone(),
+            self.schema(),
             stream,
         )))
     }
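
With the stored `schema` field gone, the plan's schema has a single source of truth: the Arrow schema captured inside `PlanProperties`. Below is a minimal sketch of that pattern, assuming a DataFusion release where `PlanProperties::new` takes `(EquivalenceProperties, Partitioning, ExecutionMode)` and where the trait-provided `ExecutionPlan::schema()` reads the schema back from `properties()`; the helper `plan_properties_for` is hypothetical and not part of this commit.

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
use datafusion::physical_expr::EquivalenceProperties;
use datafusion::physical_plan::{ExecutionMode, Partitioning, PlanProperties};

/// Hypothetical helper: build `PlanProperties` around an Arrow schema.
/// The trait-provided `ExecutionPlan::schema()` reads the schema back out
/// of these properties, so the plan needs no separate `schema` field.
fn plan_properties_for(schema: SchemaRef) -> PlanProperties {
    PlanProperties::new(
        EquivalenceProperties::new(schema),
        Partitioning::UnknownPartitioning(1),
        ExecutionMode::Bounded,
    )
}

fn main() {
    let schema: SchemaRef = Arc::new(ArrowSchema::new(vec![Field::new(
        "foo2",
        DataType::Utf8,
        false,
    )]));
    let props = plan_properties_for(schema.clone());
    // The schema carried by the equivalence properties is the single
    // source of truth handed back to callers of `schema()`.
    assert_eq!(props.eq_properties.schema(), &schema);
}
```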

crates/integrations/datafusion/tests/integration_datafusion_test.rs

Lines changed: 23 additions & 2 deletions
@@ -22,8 +22,9 @@ use std::sync::Arc;
 use std::vec;

 use datafusion::arrow::array::{Array, StringArray};
-use datafusion::arrow::datatypes::DataType;
+use datafusion::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
 use datafusion::execution::context::SessionContext;
+use datafusion::parquet::arrow::PARQUET_FIELD_ID_META_KEY;
 use iceberg::io::FileIOBuilder;
 use iceberg::spec::{NestedField, PrimitiveType, Schema, StructType, Type};
 use iceberg::{Catalog, NamespaceIdent, Result, TableCreation};
@@ -83,7 +84,7 @@ fn get_table_creation(
 }

 #[tokio::test]
-async fn test_provider_get_table_schema() -> Result<()> {
+async fn test_provider_plan_stream_schema() -> Result<()> {
     let iceberg_catalog = get_iceberg_catalog();
     let namespace = NamespaceIdent::new("test_provider_get_table_schema".to_string());
     set_test_namespace(&iceberg_catalog, &namespace).await?;
@@ -111,6 +112,26 @@
         assert!(!field.is_nullable())
     }

+    let df = ctx
+        .sql("select foo2 from catalog.test_provider_get_table_schema.my_table")
+        .await
+        .unwrap();
+
+    let task_ctx = Arc::new(df.task_ctx());
+    let plan = df.create_physical_plan().await.unwrap();
+    let stream = plan.execute(1, task_ctx).unwrap();
+
+    // Ensure both the plan and the stream conform to the same schema
+    assert_eq!(plan.schema(), stream.schema());
+    assert_eq!(
+        stream.schema().as_ref(),
+        &ArrowSchema::new(vec![Field::new("foo2", DataType::Utf8, false)
+            .with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                "2".to_string(),
+            )]))]),
+    );
+
     Ok(())
 }
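
For context on why handing `self.schema()` to the adapter is enough: `RecordBatchStreamAdapter` reports whatever schema it was constructed with, so the stream cannot drift from the plan. A minimal sketch under that assumption, using only the `datafusion` and `futures` crates; the empty batch stream and the `foo2` field are stand-ins, not code from this commit.

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result as DFResult;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::RecordBatchStream;

fn main() {
    let schema: SchemaRef = Arc::new(ArrowSchema::new(vec![Field::new(
        "foo2",
        DataType::Utf8,
        false,
    )]));

    // An empty batch stream stands in for the real Iceberg scan output.
    let batches = futures::stream::iter(Vec::<DFResult<RecordBatch>>::new());

    // The adapter reports exactly the schema it was constructed with, so
    // passing the plan's `schema()` at construction keeps the two aligned.
    let adapted = RecordBatchStreamAdapter::new(schema.clone(), batches);
    assert_eq!(adapted.schema(), schema);
}
```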
