
Commit bad8e4e

Fokko, roeap, kevinjqliu, and emkornfield authored
Expose Avro reader to PyIceberg (#1328)
## Which issue does this PR close?

I've been looking into exposing the Avro readers to PyIceberg. This will give a huge benefit to PyIceberg because we can drop the Cython Avro reader.

## What changes are included in this PR?

Exposes methods and structures to read the manifest lists and the manifests themselves.

## Are these changes tested?

By using them in PyIceberg :)

---------

Co-authored-by: Robert Pack <[email protected]>
Co-authored-by: Kevin Liu <[email protected]>
Co-authored-by: Micah Kornfield <[email protected]>
Co-authored-by: Kevin Liu <[email protected]>
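For context, a rough sketch of how PyIceberg might consume these bindings to walk from a manifest list down to individual data files. This is illustration only: the module path pyiceberg_core.manifest and the names read_manifest_list, read_manifest_entries, entries, manifest_path, and data_file are assumptions, since the manifest module itself is not part of the excerpt shown below.

# Hypothetical consumer sketch; every pyiceberg_core name below is an
# assumption, not the confirmed API introduced by this PR.
from pyiceberg_core import manifest

def data_file_paths(manifest_list_path: str) -> list[str]:
    # Read the snapshot's manifest list to find its manifest files ...
    with open(manifest_list_path, "rb") as f:
        manifest_list = manifest.read_manifest_list(f.read())

    paths: list[str] = []
    for manifest_file in manifest_list.entries():
        # ... then read each manifest to reach the individual data files.
        with open(manifest_file.manifest_path, "rb") as f:
            for entry in manifest.read_manifest_entries(f.read()):
                paths.append(entry.data_file.file_path)
    return paths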
1 parent d2f655e commit bad8e4e

File tree

6 files changed: +985, -1 lines changed


bindings/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ include = [
ignore = ["F403", "F405"]

[tool.hatch.envs.dev]
-dependencies = ["maturin>=1.0,<2.0", "pytest>=8.3.2", "datafusion==45.*", "pyiceberg[sql-sqlite,pyarrow]>=0.9.1"]
+dependencies = ["maturin>=1.0,<2.0", "pytest>=8.3.2", "datafusion==45.*", "pyiceberg[sql-sqlite,pyarrow]>=0.10.0", "fastavro>=1.11.1"]

[tool.hatch.envs.dev.scripts]
build = "maturin build --out dist --sdist"
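The dev environment bumps pyiceberg to >=0.10.0 and adds fastavro. The commit message does not say why fastavro is needed; a plausible use is cross-checking the Rust-backed Avro reader against an independent Python implementation in the binding tests. A minimal sketch of such a check, assuming manifest.read_manifest_entries as the entry point and comparing only record counts:

# Hypothetical cross-check; the pyiceberg_core API names are assumptions.
import fastavro
from pyiceberg_core import manifest

def check_entry_count_against_fastavro(manifest_path: str) -> None:
    # Count records with fastavro as an independent reference reader.
    with open(manifest_path, "rb") as f:
        expected = sum(1 for _ in fastavro.reader(f))

    # Read the same file through the Rust-backed bindings.
    with open(manifest_path, "rb") as f:
        entries = manifest.read_manifest_entries(f.read())

    assert len(list(entries)) == expected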

bindings/python/src/data_file.rs

Lines changed: 165 additions & 0 deletions
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::collections::HashMap;

use iceberg::spec::{DataFile, DataFileFormat, PrimitiveLiteral};
use pyo3::IntoPyObjectExt;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyBytes;

#[pyclass()]
pub struct PyPrimitiveLiteral {
    inner: PrimitiveLiteral,
}

#[pymethods]
impl PyPrimitiveLiteral {
    pub fn value(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
        match &self.inner {
            PrimitiveLiteral::Boolean(v) => v.into_py_any(py),
            PrimitiveLiteral::Int(v) => v.into_py_any(py),
            PrimitiveLiteral::Long(v) => v.into_py_any(py),
            PrimitiveLiteral::Float(v) => v.0.into_py_any(py), // unwrap OrderedFloat
            PrimitiveLiteral::Double(v) => v.0.into_py_any(py),
            PrimitiveLiteral::String(v) => v.into_py_any(py),
            PrimitiveLiteral::Binary(v) => PyBytes::new(py, v).into_py_any(py),
            PrimitiveLiteral::Int128(v) => v.into_py_any(py), // Python handles big ints
            PrimitiveLiteral::UInt128(v) => v.into_py_any(py),
            PrimitiveLiteral::AboveMax => Err(PyValueError::new_err("AboveMax is not supported")),
            PrimitiveLiteral::BelowMin => Err(PyValueError::new_err("BelowMin is not supported")),
        }
    }
}

#[pyclass]
pub struct PyDataFile {
    inner: DataFile,
}

#[pymethods]
impl PyDataFile {
    #[getter]
    fn content(&self) -> i32 {
        self.inner.content_type() as i32
    }

    #[getter]
    fn file_path(&self) -> &str {
        self.inner.file_path()
    }

    #[getter]
    fn file_format(&self) -> &str {
        match self.inner.file_format() {
            DataFileFormat::Avro => "avro",
            DataFileFormat::Orc => "orc",
            DataFileFormat::Parquet => "parquet",
            DataFileFormat::Puffin => "puffin",
        }
    }

    #[getter]
    fn partition(&self) -> Vec<Option<PyPrimitiveLiteral>> {
        self.inner
            .partition()
            .iter()
            .map(|lit| {
                lit.and_then(|l| {
                    Some(PyPrimitiveLiteral {
                        inner: l.as_primitive_literal()?,
                    })
                })
            })
            .collect()
    }

    #[getter]
    fn record_count(&self) -> u64 {
        self.inner.record_count()
    }

    #[getter]
    fn file_size_in_bytes(&self) -> u64 {
        self.inner.file_size_in_bytes()
    }

    #[getter]
    fn column_sizes(&self) -> &HashMap<i32, u64> {
        self.inner.column_sizes()
    }

    #[getter]
    fn value_counts(&self) -> &HashMap<i32, u64> {
        self.inner.value_counts()
    }

    #[getter]
    fn null_value_counts(&self) -> &HashMap<i32, u64> {
        self.inner.null_value_counts()
    }

    #[getter]
    fn nan_value_counts(&self) -> &HashMap<i32, u64> {
        self.inner.nan_value_counts()
    }

    #[getter]
    fn upper_bounds(&self) -> HashMap<i32, Vec<u8>> {
        self.inner
            .upper_bounds()
            .iter()
            .map(|(k, v)| (*k, v.to_bytes().unwrap().to_vec()))
            .collect()
    }

    #[getter]
    fn lower_bounds(&self) -> HashMap<i32, Vec<u8>> {
        self.inner
            .lower_bounds()
            .iter()
            .map(|(k, v)| (*k, v.to_bytes().unwrap().to_vec()))
            .collect()
    }

    #[getter]
    fn key_metadata(&self) -> Option<&[u8]> {
        self.inner.key_metadata()
    }

    #[getter]
    fn split_offsets(&self) -> &[i64] {
        self.inner.split_offsets()
    }

    #[getter]
    fn equality_ids(&self) -> &[i32] {
        self.inner.equality_ids()
    }

    #[getter]
    fn sort_order_id(&self) -> Option<i32> {
        self.inner.sort_order_id()
    }
}

impl PyDataFile {
    pub fn new(inner: DataFile) -> Self {
        Self { inner }
    }
}
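On the Python side these getters surface as plain attributes: partition is a list of optional PyPrimitiveLiteral wrappers whose value() returns a native Python object, while lower_bounds and upper_bounds hand back the Iceberg-serialized bound bytes keyed by field id, leaving deserialization against the table schema to PyIceberg. A small sketch of consuming a PyDataFile from Python; how the object is obtained is outside this file and assumed here:

# Sketch of reading PyDataFile attributes; assumes `data_file` was produced
# by the manifest-reading functions exposed elsewhere in this PR.
def summarize(data_file) -> dict:
    return {
        "path": data_file.file_path,
        "format": data_file.file_format,   # "avro", "orc", "parquet", or "puffin"
        "records": data_file.record_count,
        # Each partition value is an optional PyPrimitiveLiteral; value()
        # converts it to a native Python bool/int/float/str/bytes.
        "partition": [lit.value() if lit is not None else None for lit in data_file.partition],
        # Bounds are raw serialized bytes keyed by field id; PyIceberg is
        # expected to decode them with the table schema.
        "lower_bounds": dict(data_file.lower_bounds),
        "upper_bounds": dict(data_file.upper_bounds),
    }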

bindings/python/src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -17,14 +17,17 @@

use pyo3::prelude::*;

+mod data_file;
mod datafusion_table_provider;
mod error;
+mod manifest;
mod runtime;
mod transform;

#[pymodule]
fn pyiceberg_core_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
    datafusion_table_provider::register_module(py, m)?;
    transform::register_module(py, m)?;
+    manifest::register_module(py, m)?;
    Ok(())
}
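The new manifest module plugs into the existing pattern where each Rust module exposes a register_module(py, m) hook and lib.rs simply chains them into the single pyiceberg_core_rust extension. From Python, the new functionality should then be importable alongside the existing transform bindings; the re-export path through the pyiceberg_core wrapper package is an assumption here:

# Assumed Python-side view after registration; the re-export path through the
# pyiceberg_core wrapper package may differ from what is shown.
from pyiceberg_core import manifest, transform

# Both names resolve to submodules registered by pyiceberg_core_rust in lib.rs.
print(sorted(n for n in dir(manifest) if not n.startswith("_")))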
