From f803a8ed735e440e202a96b560efa7409b10bf1f Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 25 Aug 2025 14:46:57 -0400 Subject: [PATCH] WIP: Parquet native geometry type support --- .../src/extension/canonical/geometry.rs | 136 ++++++++++++++++++ arrow-schema/src/extension/canonical/mod.rs | 2 + parquet/src/arrow/schema/mod.rs | 58 +++++++- parquet/src/basic.rs | 38 +++-- parquet/src/schema/printer.rs | 4 +- 5 files changed, 223 insertions(+), 15 deletions(-) create mode 100644 arrow-schema/src/extension/canonical/geometry.rs diff --git a/arrow-schema/src/extension/canonical/geometry.rs b/arrow-schema/src/extension/canonical/geometry.rs new file mode 100644 index 000000000000..ba833ff11282 --- /dev/null +++ b/arrow-schema/src/extension/canonical/geometry.rs @@ -0,0 +1,136 @@ +use crate::extension::ExtensionType; +use crate::ArrowError; + +/// Geospatial features in the WKB format with linear/planar edges interpolation +#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)] +pub struct Geometry { + crs: Option, +} + +impl Geometry { + /// Create a new Geometry extension type with an optional CRS. + pub fn new(crs: Option) -> Self { + Self { crs } + } + + /// Get the CRS of the Geometry type, if any. + pub fn crs(&self) -> Option<&str> { + self.crs.as_deref() + } +} + +impl ExtensionType for Geometry { + const NAME: &'static str = "geoarrow.wkb"; + + type Metadata = (); + + fn metadata(&self) -> &Self::Metadata { + &() + } + + fn serialize_metadata(&self) -> Option { + None + } + + fn deserialize_metadata(_metadata: Option<&str>) -> Result { + Ok(()) + } + + fn supports_data_type(&self, data_type: &crate::DataType) -> Result<(), ArrowError> { + match data_type { + crate::DataType::Binary + | crate::DataType::LargeBinary + | crate::DataType::BinaryView => Ok(()), + data_type => Err(ArrowError::InvalidArgumentError(format!( + "Geometry data type mismatch, expected one of Binary, LargeBinary, BinaryView, found {data_type}" + ))), + } + } + + fn try_new(data_type: &crate::DataType, _metadata: Self::Metadata) -> Result { + // TODO: fix + let geo = Self { crs: None }; + geo.supports_data_type(data_type)?; + Ok(geo) + } +} + +/// Edge interpolation algorithm for Geography logical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum GeographyAlgorithm { + /// Edges are interpolated as geodesics on a sphere. + SPHERICAL, + + /// + VINCENTY, + + /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 + THOMAS, + + /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. + ANDOYER, + + /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 + KARNEY, +} + +/// Geospatial features in the [WKB format](https://libgeos.org/specifications/wkb/) with an +/// explicit (non-linear/non-planar) edges interpolation algorithm. +#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)] +pub struct Geography { + crs: Option, + algorithm: Option, +} + +impl Geography { + /// Create a new Geography extension type with an optional CRS and algorithm. + pub fn new(crs: Option, algorithm: Option) -> Self { + Self { crs, algorithm } + } + + /// Get the CRS of the Geography type, if any. + pub fn crs(&self) -> Option<&str> { + self.crs.as_deref() + } + + /// Get the edge interpolation algorithm of the Geography type, if any. + pub fn algorithm(&self) -> Option<&GeographyAlgorithm> { + self.algorithm.as_ref() + } +} + +impl ExtensionType for Geography { + const NAME: &'static str = "geoarrow.wkb"; + + type Metadata = (); + + fn metadata(&self) -> &Self::Metadata { + &() + } + + fn serialize_metadata(&self) -> Option { + None + } + + fn deserialize_metadata(_metadata: Option<&str>) -> Result { + Ok(()) + } + + fn supports_data_type(&self, data_type: &crate::DataType) -> Result<(), ArrowError> { + match data_type { + crate::DataType::Binary + | crate::DataType::LargeBinary + | crate::DataType::BinaryView => Ok(()), + data_type => Err(ArrowError::InvalidArgumentError(format!( + "Geography data type mismatch, expected one of Binary, LargeBinary, BinaryView, found {data_type}" + ))), + } + } + + fn try_new(data_type: &crate::DataType, _metadata: Self::Metadata) -> Result { + // TODO: fix + let geo = Self::default(); + geo.supports_data_type(data_type)?; + Ok(geo) + } +} diff --git a/arrow-schema/src/extension/canonical/mod.rs b/arrow-schema/src/extension/canonical/mod.rs index 3d66299ca885..fb5404c23af3 100644 --- a/arrow-schema/src/extension/canonical/mod.rs +++ b/arrow-schema/src/extension/canonical/mod.rs @@ -37,6 +37,8 @@ mod uuid; pub use uuid::Uuid; mod variable_shape_tensor; pub use variable_shape_tensor::{VariableShapeTensor, VariableShapeTensorMetadata}; +mod geometry; +pub use geometry::{Geography, GeographyAlgorithm, Geometry}; use crate::{ArrowError, Field}; diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 5b079b66276a..43d9986328c0 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use arrow_ipc::writer; #[cfg(feature = "arrow_canonical_extension_types")] -use arrow_schema::extension::{Json, Uuid}; +use arrow_schema::extension::{Geography, Geometry, Json, Uuid}; use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ @@ -399,9 +399,32 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result ret.try_with_extension_type(Uuid)?, LogicalType::Json => ret.try_with_extension_type(Json::default())?, + LogicalType::Geometry { crs } => ret.try_with_extension_type(Geometry::new(crs))?, + LogicalType::Geography { crs, algorithm } => { + use arrow_schema::extension::GeographyAlgorithm; + + use crate::format::EdgeInterpolationAlgorithm; + + let algorithm = match algorithm { + Some(EdgeInterpolationAlgorithm::ANDOYER) => Some(GeographyAlgorithm::ANDOYER), + Some(EdgeInterpolationAlgorithm::KARNEY) => Some(GeographyAlgorithm::KARNEY), + Some(EdgeInterpolationAlgorithm::SPHERICAL) => { + Some(GeographyAlgorithm::SPHERICAL) + } + Some(EdgeInterpolationAlgorithm::THOMAS) => Some(GeographyAlgorithm::THOMAS), + Some(EdgeInterpolationAlgorithm::VINCENTY) => { + Some(GeographyAlgorithm::VINCENTY) + } + None => None, + _ => None, + }; + ret.try_with_extension_type(Geography::new(crs, algorithm))? + } _ => {} } } @@ -606,6 +629,39 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_repetition(repetition) .with_id(id) + .with_logical_type( + #[cfg(feature = "arrow_canonical_extension_types")] + if let Ok(t) = field.try_extension_type::() { + Some(LogicalType::Geometry { + crs: t.crs().map(|s| s.to_string()), + }) + } else if let Ok(t) = field.try_extension_type::() { + Some(LogicalType::Geography { + crs: t.crs().map(|s| s.to_string()), + algorithm: t.algorithm().map(|alg| match alg { + arrow_schema::extension::GeographyAlgorithm::ANDOYER => { + crate::format::EdgeInterpolationAlgorithm::ANDOYER + } + arrow_schema::extension::GeographyAlgorithm::KARNEY => { + crate::format::EdgeInterpolationAlgorithm::KARNEY + } + arrow_schema::extension::GeographyAlgorithm::SPHERICAL => { + crate::format::EdgeInterpolationAlgorithm::SPHERICAL + } + arrow_schema::extension::GeographyAlgorithm::THOMAS => { + crate::format::EdgeInterpolationAlgorithm::THOMAS + } + arrow_schema::extension::GeographyAlgorithm::VINCENTY => { + crate::format::EdgeInterpolationAlgorithm::VINCENTY + } + }), + }) + } else { + None + }, + #[cfg(not(feature = "arrow_canonical_extension_types"))] + None, + ) .build() } DataType::FixedSizeBinary(length) => { diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index c1e301136d0e..4d7c9aa121b4 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -22,7 +22,7 @@ use std::str::FromStr; use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; -use crate::format as parquet; +use crate::format::{self as parquet, EdgeInterpolationAlgorithm, GeographyType, GeometryType}; use crate::errors::{ParquetError, Result}; @@ -231,9 +231,18 @@ pub enum LogicalType { /// A Variant value. Variant, /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation. - Geometry, + Geometry { + /// A custom CRS. If unset, it defaults to "OGC:CRS84", which means that the geometries + /// must be stored in longitude, latitude based on the WGS84 datum. + crs: Option, + }, /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation. - Geography, + Geography { + /// A custom CRS. If unset, the CRS defaults to "OGC:CRS84". + crs: Option, + /// Edge interpolation method. + algorithm: Option, + }, } // ---------------------------------------------------------------------- @@ -584,9 +593,9 @@ impl ColumnOrder { LogicalType::Unknown => SortOrder::UNDEFINED, LogicalType::Uuid => SortOrder::UNSIGNED, LogicalType::Float16 => SortOrder::SIGNED, - LogicalType::Variant | LogicalType::Geometry | LogicalType::Geography => { - SortOrder::UNDEFINED - } + LogicalType::Variant + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } => SortOrder::UNDEFINED, }, // Fall back to converted type None => Self::get_converted_sort_order(converted_type, physical_type), @@ -850,8 +859,11 @@ impl From for LogicalType { parquet::LogicalType::UUID(_) => LogicalType::Uuid, parquet::LogicalType::FLOAT16(_) => LogicalType::Float16, parquet::LogicalType::VARIANT(_) => LogicalType::Variant, - parquet::LogicalType::GEOMETRY(_) => LogicalType::Geometry, - parquet::LogicalType::GEOGRAPHY(_) => LogicalType::Geography, + parquet::LogicalType::GEOMETRY(t) => LogicalType::Geometry { crs: t.crs }, + parquet::LogicalType::GEOGRAPHY(t) => LogicalType::Geography { + crs: t.crs, + algorithm: t.algorithm, + }, } } } @@ -894,8 +906,10 @@ impl From for parquet::LogicalType { LogicalType::Uuid => parquet::LogicalType::UUID(Default::default()), LogicalType::Float16 => parquet::LogicalType::FLOAT16(Default::default()), LogicalType::Variant => parquet::LogicalType::VARIANT(Default::default()), - LogicalType::Geometry => parquet::LogicalType::GEOMETRY(Default::default()), - LogicalType::Geography => parquet::LogicalType::GEOGRAPHY(Default::default()), + LogicalType::Geometry { crs } => parquet::LogicalType::GEOMETRY(GeometryType { crs }), + LogicalType::Geography { crs, algorithm } => { + parquet::LogicalType::GEOGRAPHY(GeographyType { crs, algorithm }) + } } } } @@ -948,8 +962,8 @@ impl From> for ConvertedType { LogicalType::Uuid | LogicalType::Float16 | LogicalType::Variant - | LogicalType::Geometry - | LogicalType::Geography + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } | LogicalType::Unknown => ConvertedType::NONE, }, None => ConvertedType::NONE, diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 5ef068da915b..1a86c9be9697 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -327,8 +327,8 @@ fn print_logical_and_converted( LogicalType::Map => "MAP".to_string(), LogicalType::Float16 => "FLOAT16".to_string(), LogicalType::Variant => "VARIANT".to_string(), - LogicalType::Geometry => "GEOMETRY".to_string(), - LogicalType::Geography => "GEOGRAPHY".to_string(), + LogicalType::Geometry { .. } => "GEOMETRY".to_string(), + LogicalType::Geography { .. } => "GEOGRAPHY".to_string(), LogicalType::Unknown => "UNKNOWN".to_string(), }, None => {