From 9344ae9c2cd8cbde4aa39b9720334f00709640ca Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 09:37:44 -0700 Subject: [PATCH 01/10] PARQUET-2417: Add geo logical type annotations to parquet-java --- parquet-column/pom.xml | 6 + .../parquet/schema/LogicalTypeAnnotation.java | 181 ++++++++++++++++++ .../parquet/schema/PrimitiveStringifier.java | 19 ++ parquet-hadoop/pom.xml | 6 + pom.xml | 1 + 5 files changed, 213 insertions(+) diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index 654fad27cd..01b5b8e8c6 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -76,6 +76,12 @@ ${slf4j.version} + + org.locationtech.jts + jts-core + ${jts.version} + + com.carrotsearch junit-benchmarks diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 749beaa95e..1aaff900e0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -41,6 +41,7 @@ import java.util.Set; import java.util.function.Supplier; import org.apache.parquet.Preconditions; +import org.apache.parquet.format.EdgeInterpolationAlgorithm; public abstract class LogicalTypeAnnotation { enum LogicalTypeToken { @@ -155,6 +156,40 @@ protected LogicalTypeAnnotation fromString(List params) { return float16Type(); } }, + GEOMETRY { + @Override + protected LogicalTypeAnnotation fromString(List params) { + if (params.size() > 1) { + throw new RuntimeException("Expecting only crs for geometry logical type, got " + params.size()); + } + if (params.size() == 1) { + String crs = params.get(0); + return geometryType(crs); + } else { + return geometryType(); + } + } + }, + GEOGRAPHY { + @Override + protected LogicalTypeAnnotation fromString(List params) { + if (params.size() > 2) { + throw new RuntimeException( + "Expecting at most 2 parameters for geography logical type (crs and edgeAlgorithm), got " + + params.size()); + } + if (params.size() == 1) { + String crs = params.get(0); + return geographyType(crs, null); + } + if (params.size() == 2) { + String crs = params.get(0); + String edgeAlgorithm = params.get(1); + return geographyType(crs, EdgeInterpolationAlgorithm.valueOf(edgeAlgorithm)); + } + return geographyType(); + } + }, UNKNOWN { @Override protected LogicalTypeAnnotation fromString(List params) { @@ -334,6 +369,22 @@ public static Float16LogicalTypeAnnotation float16Type() { return Float16LogicalTypeAnnotation.INSTANCE; } + public static GeometryLogicalTypeAnnotation geometryType(String crs) { + return new GeometryLogicalTypeAnnotation(crs); + } + + public static GeometryLogicalTypeAnnotation geometryType() { + return new GeometryLogicalTypeAnnotation("OGC:CRS84"); + } + + public static GeographyLogicalTypeAnnotation geographyType(String crs, EdgeInterpolationAlgorithm edgeAlgorithm) { + return new GeographyLogicalTypeAnnotation(crs, edgeAlgorithm); + } + + public static GeographyLogicalTypeAnnotation geographyType() { + return new GeographyLogicalTypeAnnotation("OGC:CRS84", null); + } + public static UnknownLogicalTypeAnnotation unknownType() { return UnknownLogicalTypeAnnotation.INSTANCE; } @@ -1183,6 +1234,128 @@ public boolean equals(Object obj) { } } + public static class GeometryLogicalTypeAnnotation extends LogicalTypeAnnotation { + private final String crs; + + private GeometryLogicalTypeAnnotation(String crs) { + this.crs = crs; + } + + @Override + @Deprecated + public OriginalType toOriginalType() { + return null; + } + + @Override + public Optional accept(LogicalTypeAnnotationVisitor logicalTypeAnnotationVisitor) { + return logicalTypeAnnotationVisitor.visit(this); + } + + @Override + LogicalTypeToken getType() { + return LogicalTypeToken.GEOMETRY; + } + + @Override + protected String typeParametersAsString() { + StringBuilder sb = new StringBuilder(); + sb.append("(").append(crs != null && !crs.isEmpty() ? crs : "").append(")"); + return sb.toString(); + } + + public String getCrs() { + return crs; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeometryLogicalTypeAnnotation)) { + return false; + } + GeometryLogicalTypeAnnotation other = (GeometryLogicalTypeAnnotation) obj; + return Objects.equals(crs, other.crs); + } + + @Override + public int hashCode() { + return Objects.hash(crs); + } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.WKB_STRINGIFIER; + } + } + + public static class GeographyLogicalTypeAnnotation extends LogicalTypeAnnotation { + private final String crs; + private final EdgeInterpolationAlgorithm edgeAlgorithm; + + private GeographyLogicalTypeAnnotation(String crs, EdgeInterpolationAlgorithm edgeAlgorithm) { + this.crs = crs; + this.edgeAlgorithm = edgeAlgorithm; + } + + @Override + @Deprecated + public OriginalType toOriginalType() { + return null; + } + + @Override + public Optional accept(LogicalTypeAnnotationVisitor logicalTypeAnnotationVisitor) { + return logicalTypeAnnotationVisitor.visit(this); + } + + @Override + LogicalTypeToken getType() { + return LogicalTypeToken.GEOGRAPHY; + } + + @Override + protected String typeParametersAsString() { + StringBuilder sb = new StringBuilder(); + sb.append("("); + if (crs != null && !crs.isEmpty()) { + sb.append(crs); + } + if (edgeAlgorithm != null) { + if (crs != null && !crs.isEmpty()) sb.append(","); + sb.append(edgeAlgorithm); + } + sb.append(")"); + return sb.toString(); + } + + public String getCrs() { + return crs; + } + + public EdgeInterpolationAlgorithm getEdgeAlgorithm() { + return edgeAlgorithm; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeographyLogicalTypeAnnotation)) { + return false; + } + GeographyLogicalTypeAnnotation other = (GeographyLogicalTypeAnnotation) obj; + return Objects.equals(crs, other.crs) && Objects.equals(edgeAlgorithm, other.edgeAlgorithm); + } + + @Override + public int hashCode() { + return Objects.hash(crs, edgeAlgorithm); + } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.WKB_STRINGIFIER; + } + } + /** * Implement this interface to visit a logical type annotation in the schema. * The default implementation for each logical type specific visitor method is empty. @@ -1259,6 +1432,14 @@ default Optional visit(Float16LogicalTypeAnnotation float16LogicalType) { return empty(); } + default Optional visit(GeometryLogicalTypeAnnotation geometryLogicalType) { + return empty(); + } + + default Optional visit(GeographyLogicalTypeAnnotation geographyLogicalType) { + return empty(); + } + default Optional visit(UnknownLogicalTypeAnnotation unknownLogicalTypeAnnotation) { return empty(); } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java index 7aface72a7..0c696b029e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java @@ -35,6 +35,9 @@ import java.util.concurrent.TimeUnit; import javax.naming.OperationNotSupportedException; import org.apache.parquet.io.api.Binary; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; /** * Class that provides string representations for the primitive values. These string values are to be used for @@ -442,6 +445,22 @@ private void appendHex(byte[] array, int offset, int length, StringBuilder build } }; + static final PrimitiveStringifier WKB_STRINGIFIER = new BinaryStringifierBase("WKB_STRINGIFIER") { + + @Override + String stringifyNotNull(Binary value) { + + Geometry geometry; + try { + WKBReader reader = new WKBReader(); + geometry = reader.read(value.getBytesUnsafe()); + return geometry.toText(); + } catch (ParseException e) { + return BINARY_INVALID; + } + } + }; + static final PrimitiveStringifier FLOAT16_STRINGIFIER = new BinaryStringifierBase("FLOAT16_STRINGIFIER") { @Override diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index adfebfbd05..687310d9e2 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -135,6 +135,12 @@ jar compile + + org.locationtech.jts + jts-core + ${jts.version} + test + io.airlift aircompressor diff --git a/pom.xml b/pom.xml index 73360fb72d..f2f3b46bc3 100644 --- a/pom.xml +++ b/pom.xml @@ -104,6 +104,7 @@ 2.0.9 0.27ea0 3.5.0 + 1.20.0 2.3 From a65945b3699fea91ee00300189eba1c3de5d04ff Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 10:00:54 -0700 Subject: [PATCH 02/10] add types --- .../org/apache/parquet/schema/PrimitiveType.java | 12 ++++++++++++ .../main/java/org/apache/parquet/schema/Types.java | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index e74d7cde02..6beff4da93 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -271,6 +271,18 @@ public Optional visit( LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); } + + @Override + public Optional visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } }) .orElseThrow(() -> new ShouldNeverHappenException( "No comparator logic implemented for BINARY logical type: " + logicalType)); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index 399672022c..2dcbe8cf2d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -577,6 +577,12 @@ public Optional visit( return checkBinaryPrimitiveType(enumLogicalType); } + @Override + public Optional visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { + return checkBinaryPrimitiveType(geographyLogicalType); + } + private Optional checkFixedPrimitiveType( int l, LogicalTypeAnnotation logicalTypeAnnotation) { Preconditions.checkState( From 1022778fd4de1ef818d07fe38ce62715141bc6a5 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 10:36:40 -0700 Subject: [PATCH 03/10] converter --- .../java/org/apache/parquet/schema/Types.java | 6 ++ .../parquet/schema/TestTypeBuilders.java | 100 ++++++++++++++++++ .../converter/ParquetMetadataConverter.java | 34 ++++++ 3 files changed, 140 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index 2dcbe8cf2d..fd82d36768 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -577,6 +577,12 @@ public Optional visit( return checkBinaryPrimitiveType(enumLogicalType); } + @Override + public Optional visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + return checkBinaryPrimitiveType(geometryLogicalType); + } + @Override public Optional visit( LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index 71886d1208..f894ea4738 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -55,6 +55,7 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; +import org.apache.parquet.format.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; import org.junit.Assert; @@ -1477,6 +1478,105 @@ public void testDecimalLogicalTypeWithDeprecatedPrecisionMismatch() { .named("aDecimal"); } + @Test + public void testGeometryLogicalType() { + // Test with default CRS + PrimitiveType defaultCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); + PrimitiveType defaultCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geometryType("OGC:CRS84")) + .named("aGeometry"); + Assert.assertEquals(defaultCrsExpected, defaultCrsActual); + + // Test with custom CRS + PrimitiveType customCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("EPSG:4326")); + PrimitiveType customCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geometryType("EPSG:4326")) + .named("aGeometry"); + Assert.assertEquals(customCrsExpected, customCrsActual); + + // Test with optional repetition + PrimitiveType optionalGeometryExpected = new PrimitiveType( + OPTIONAL, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); + PrimitiveType optionalGeometryActual = Types.optional(BINARY) + .as(LogicalTypeAnnotation.geometryType("OGC:CRS84")) + .named("aGeometry"); + Assert.assertEquals(optionalGeometryExpected, optionalGeometryActual); + } + + @Test + public void testGeographyLogicalType() { + // Test with default CRS and no edge algorithm + PrimitiveType defaultCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("OGC:CRS84", null)); + PrimitiveType defaultCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType("OGC:CRS84", null)) + .named("aGeography"); + Assert.assertEquals(defaultCrsExpected, defaultCrsActual); + + // Test with custom CRS and no edge algorithm + PrimitiveType customCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("EPSG:4326", null)); + PrimitiveType customCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", null)) + .named("aGeography"); + Assert.assertEquals(customCrsExpected, customCrsActual); + + // Test with custom CRS and edge algorithm + EdgeInterpolationAlgorithm greatCircle = EdgeInterpolationAlgorithm.SPHERICAL; + PrimitiveType customCrsWithEdgeAlgorithmExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("EPSG:4326", greatCircle)); + PrimitiveType customCrsWithEdgeAlgorithmActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", greatCircle)) + .named("aGeography"); + Assert.assertEquals(customCrsWithEdgeAlgorithmExpected, customCrsWithEdgeAlgorithmActual); + + // Test with optional repetition + PrimitiveType optionalGeographyExpected = new PrimitiveType( + OPTIONAL, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("OGC:CRS84", null)); + PrimitiveType optionalGeographyActual = Types.optional(BINARY) + .as(LogicalTypeAnnotation.geographyType("OGC:CRS84", null)) + .named("aGeography"); + Assert.assertEquals(optionalGeographyExpected, optionalGeographyActual); + } + + @Test + public void testGeographyLogicalTypeWithoutEdgeInterpolationAlgorithm() { + // Test with default CRS and no edge algorithm + PrimitiveType defaultCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); + PrimitiveType defaultCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType()) + .named("aGeography"); + Assert.assertEquals(defaultCrsExpected, defaultCrsActual); + + // Test with custom CRS and no edge algorithm + PrimitiveType customCrsExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("EPSG:4326", null)); + PrimitiveType customCrsActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", null)) + .named("aGeography"); + Assert.assertEquals(customCrsExpected, customCrsActual); + + // Test with custom CRS and edge algorithm + PrimitiveType customCrsWithEdgeAlgorithmExpected = new PrimitiveType( + REQUIRED, BINARY, "aGeography", + LogicalTypeAnnotation.geographyType("EPSG:4326", null)); + PrimitiveType customCrsWithEdgeAlgorithmActual = Types.required(BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", null)) + .named("aGeography"); + Assert.assertEquals(customCrsWithEdgeAlgorithmExpected, customCrsWithEdgeAlgorithmActual); + + // Test with optional repetition + PrimitiveType optionalGeographyExpected = new PrimitiveType( + OPTIONAL, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); + PrimitiveType optionalGeographyActual = Types.optional(BINARY) + .as(LogicalTypeAnnotation.geographyType()) + .named("aGeography"); + Assert.assertEquals(optionalGeographyExpected, optionalGeographyActual); + } + /** * A convenience method to avoid a large number of @Test(expected=...) tests * diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 5759be234f..1aaab3caf1 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -76,10 +76,13 @@ import org.apache.parquet.format.DataPageHeaderV2; import org.apache.parquet.format.DecimalType; import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.EdgeInterpolationAlgorithm; import org.apache.parquet.format.Encoding; import org.apache.parquet.format.EncryptionWithColumnKey; import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.GeographyType; +import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.IntType; import org.apache.parquet.format.KeyValue; import org.apache.parquet.format.LogicalType; @@ -520,6 +523,31 @@ public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnno public Optional visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) { return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion())); } + + @Override + public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + GeometryType geometryType = new GeometryType(); + if (geometryLogicalType.getCrs() != null) { + geometryType.setCrs(geometryLogicalType.getCrs()); + } + return of(LogicalType.GEOMETRY(geometryType)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { + GeographyType geographyType = new GeographyType(); + if (geographyLogicalType.getCrs() != null) { + geographyType.setCrs(geographyLogicalType.getCrs()); + } + if (geographyLogicalType.getEdgeAlgorithm() != null) { + EdgeInterpolationAlgorithm algorithm = + EdgeInterpolationAlgorithm.valueOf(String.valueOf(geographyLogicalType.getEdgeAlgorithm())); + if (algorithm != null) { + geographyType.setAlgorithm(algorithm); + } + } + return of(LogicalType.GEOGRAPHY(geographyType)); + } } private void addRowGroup( @@ -1183,6 +1211,12 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.uuidType(); case FLOAT16: return LogicalTypeAnnotation.float16Type(); + case GEOMETRY: + GeometryType geometry = type.getGEOMETRY(); + return LogicalTypeAnnotation.geometryType(geometry.getCrs()); + case GEOGRAPHY: + GeographyType geography = type.getGEOGRAPHY(); + return LogicalTypeAnnotation.geographyType(geography.getCrs(), null); case VARIANT: VariantType variant = type.getVARIANT(); return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); From 742a5110dea8a234b54d6da563f4abdf5cf23168 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 10:53:00 -0700 Subject: [PATCH 04/10] fix to string --- .../parquet/schema/LogicalTypeAnnotation.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 1aaff900e0..610b6ec208 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -1316,12 +1316,20 @@ LogicalTypeToken getType() { @Override protected String typeParametersAsString() { StringBuilder sb = new StringBuilder(); + + boolean hasCrs = crs != null && !crs.isEmpty(); + boolean hasEdgeAlgorithm = edgeAlgorithm != null; + + if (!hasCrs && !hasEdgeAlgorithm) { + return ""; // Return empty string when both are empty + } + sb.append("("); - if (crs != null && !crs.isEmpty()) { + if (hasCrs) { sb.append(crs); } - if (edgeAlgorithm != null) { - if (crs != null && !crs.isEmpty()) sb.append(","); + if (hasEdgeAlgorithm) { + if (hasCrs) sb.append(","); sb.append(edgeAlgorithm); } sb.append(")"); From 1e9f1f3f3bc79d8b0a138ad52e0d71c11f5fefeb Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 13:10:24 -0700 Subject: [PATCH 05/10] fix fmt --- .../parquet/schema/TestTypeBuilders.java | 33 +++++++++---------- parquet-hadoop/pom.xml | 6 ---- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index f894ea4738..f215323a1c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -1481,24 +1481,24 @@ public void testDecimalLogicalTypeWithDeprecatedPrecisionMismatch() { @Test public void testGeometryLogicalType() { // Test with default CRS - PrimitiveType defaultCrsExpected = new PrimitiveType( - REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); + PrimitiveType defaultCrsExpected = + new PrimitiveType(REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); PrimitiveType defaultCrsActual = Types.required(BINARY) .as(LogicalTypeAnnotation.geometryType("OGC:CRS84")) .named("aGeometry"); Assert.assertEquals(defaultCrsExpected, defaultCrsActual); // Test with custom CRS - PrimitiveType customCrsExpected = new PrimitiveType( - REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("EPSG:4326")); + PrimitiveType customCrsExpected = + new PrimitiveType(REQUIRED, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("EPSG:4326")); PrimitiveType customCrsActual = Types.required(BINARY) .as(LogicalTypeAnnotation.geometryType("EPSG:4326")) .named("aGeometry"); Assert.assertEquals(customCrsExpected, customCrsActual); // Test with optional repetition - PrimitiveType optionalGeometryExpected = new PrimitiveType( - OPTIONAL, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); + PrimitiveType optionalGeometryExpected = + new PrimitiveType(OPTIONAL, BINARY, "aGeometry", LogicalTypeAnnotation.geometryType("OGC:CRS84")); PrimitiveType optionalGeometryActual = Types.optional(BINARY) .as(LogicalTypeAnnotation.geometryType("OGC:CRS84")) .named("aGeometry"); @@ -1544,11 +1544,10 @@ public void testGeographyLogicalType() { @Test public void testGeographyLogicalTypeWithoutEdgeInterpolationAlgorithm() { // Test with default CRS and no edge algorithm - PrimitiveType defaultCrsExpected = new PrimitiveType( - REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); - PrimitiveType defaultCrsActual = Types.required(BINARY) - .as(LogicalTypeAnnotation.geographyType()) - .named("aGeography"); + PrimitiveType defaultCrsExpected = + new PrimitiveType(REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); + PrimitiveType defaultCrsActual = + Types.required(BINARY).as(LogicalTypeAnnotation.geographyType()).named("aGeography"); Assert.assertEquals(defaultCrsExpected, defaultCrsActual); // Test with custom CRS and no edge algorithm @@ -1561,19 +1560,17 @@ public void testGeographyLogicalTypeWithoutEdgeInterpolationAlgorithm() { // Test with custom CRS and edge algorithm PrimitiveType customCrsWithEdgeAlgorithmExpected = new PrimitiveType( - REQUIRED, BINARY, "aGeography", - LogicalTypeAnnotation.geographyType("EPSG:4326", null)); + REQUIRED, BINARY, "aGeography", LogicalTypeAnnotation.geographyType("EPSG:4326", null)); PrimitiveType customCrsWithEdgeAlgorithmActual = Types.required(BINARY) .as(LogicalTypeAnnotation.geographyType("EPSG:4326", null)) .named("aGeography"); Assert.assertEquals(customCrsWithEdgeAlgorithmExpected, customCrsWithEdgeAlgorithmActual); // Test with optional repetition - PrimitiveType optionalGeographyExpected = new PrimitiveType( - OPTIONAL, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); - PrimitiveType optionalGeographyActual = Types.optional(BINARY) - .as(LogicalTypeAnnotation.geographyType()) - .named("aGeography"); + PrimitiveType optionalGeographyExpected = + new PrimitiveType(OPTIONAL, BINARY, "aGeography", LogicalTypeAnnotation.geographyType()); + PrimitiveType optionalGeographyActual = + Types.optional(BINARY).as(LogicalTypeAnnotation.geographyType()).named("aGeography"); Assert.assertEquals(optionalGeographyExpected, optionalGeographyActual); } diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index 687310d9e2..adfebfbd05 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -135,12 +135,6 @@ jar compile - - org.locationtech.jts - jts-core - ${jts.version} - test - io.airlift aircompressor From 1f0f9863642595ab0d05fccaaa76419556d4a1ca Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Thu, 24 Apr 2025 13:18:43 -0700 Subject: [PATCH 06/10] fix dependency issue --- .../schema/EdgeInterpolationAlgorithm.java | 64 +++++++++++++++++++ .../parquet/schema/LogicalTypeAnnotation.java | 2 +- .../parquet/schema/TestTypeBuilders.java | 2 +- 3 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java b/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java new file mode 100644 index 0000000000..4b3a1fc599 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.schema; + +/** + * Edge interpolation algorithm for Geography logical type + */ +public enum EdgeInterpolationAlgorithm { + SPHERICAL(0), + VINCENTY(1), + THOMAS(2), + ANDOYER(3), + KARNEY(4); + + private final int value; + + private EdgeInterpolationAlgorithm(int value) { + this.value = value; + } + + /** + * Get the integer value of this enum value, as defined in the Thrift IDL. + */ + public int getValue() { + return value; + } + + /** + * Find the enum type by its integer value, as defined in the Thrift IDL. + * @return null if the value is not found. + */ + public static EdgeInterpolationAlgorithm findByValue(int value) { + switch (value) { + case 0: + return SPHERICAL; + case 1: + return VINCENTY; + case 2: + return THOMAS; + case 3: + return ANDOYER; + case 4: + return KARNEY; + default: + return null; + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 610b6ec208..80f1c1ad41 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -41,7 +41,7 @@ import java.util.Set; import java.util.function.Supplier; import org.apache.parquet.Preconditions; -import org.apache.parquet.format.EdgeInterpolationAlgorithm; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; public abstract class LogicalTypeAnnotation { enum LogicalTypeToken { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index f215323a1c..018ce5b276 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -55,7 +55,7 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; -import org.apache.parquet.format.EdgeInterpolationAlgorithm; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; import org.junit.Assert; From 0eee71b6b0c89b3e73c1031f1435ac6248d934a8 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Sat, 26 Apr 2025 22:13:17 -0700 Subject: [PATCH 07/10] address pr review comments --- .../schema/EdgeInterpolationAlgorithm.java | 2 +- .../parquet/schema/LogicalTypeAnnotation.java | 79 +++++++------------ .../parquet/schema/PrimitiveStringifier.java | 3 +- .../converter/ParquetMetadataConverter.java | 17 ++-- 4 files changed, 43 insertions(+), 58 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java b/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java index 4b3a1fc599..5357073a8f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/schema/EdgeInterpolationAlgorithm.java @@ -58,7 +58,7 @@ public static EdgeInterpolationAlgorithm findByValue(int value) { case 4: return KARNEY; default: - return null; + throw new IllegalArgumentException("Unrecognized EdgeInterpolationAlgorithm value: " + value); } } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 80f1c1ad41..be98e071f6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -44,6 +44,10 @@ import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; public abstract class LogicalTypeAnnotation { + + public static final String DEFAULT_CRS = "OGC:CRS84"; + public static final EdgeInterpolationAlgorithm DEFAULT_ALGO = EdgeInterpolationAlgorithm.SPHERICAL; + enum LogicalTypeToken { MAP { @Override @@ -160,14 +164,11 @@ protected LogicalTypeAnnotation fromString(List params) { @Override protected LogicalTypeAnnotation fromString(List params) { if (params.size() > 1) { - throw new RuntimeException("Expecting only crs for geometry logical type, got " + params.size()); - } - if (params.size() == 1) { - String crs = params.get(0); - return geometryType(crs); - } else { - return geometryType(); + throw new RuntimeException( + "Expecting at most 1 parameter for geometry logical type, got " + params.size()); } + String crs = params.isEmpty() ? null : params.get(0); + return geometryType(crs); } }, GEOGRAPHY { @@ -175,19 +176,13 @@ protected LogicalTypeAnnotation fromString(List params) { protected LogicalTypeAnnotation fromString(List params) { if (params.size() > 2) { throw new RuntimeException( - "Expecting at most 2 parameters for geography logical type (crs and edgeAlgorithm), got " + "Expecting at most 2 parameters for geography logical type (crs and edge algorithm), got " + params.size()); } - if (params.size() == 1) { - String crs = params.get(0); - return geographyType(crs, null); - } - if (params.size() == 2) { - String crs = params.get(0); - String edgeAlgorithm = params.get(1); - return geographyType(crs, EdgeInterpolationAlgorithm.valueOf(edgeAlgorithm)); - } - return geographyType(); + String crs = !params.isEmpty() ? params.get(0) : null; + EdgeInterpolationAlgorithm algo = + params.size() > 1 ? EdgeInterpolationAlgorithm.valueOf(params.get(1)) : null; + return geographyType(crs, algo); } }, UNKNOWN { @@ -373,16 +368,12 @@ public static GeometryLogicalTypeAnnotation geometryType(String crs) { return new GeometryLogicalTypeAnnotation(crs); } - public static GeometryLogicalTypeAnnotation geometryType() { - return new GeometryLogicalTypeAnnotation("OGC:CRS84"); - } - public static GeographyLogicalTypeAnnotation geographyType(String crs, EdgeInterpolationAlgorithm edgeAlgorithm) { return new GeographyLogicalTypeAnnotation(crs, edgeAlgorithm); } public static GeographyLogicalTypeAnnotation geographyType() { - return new GeographyLogicalTypeAnnotation("OGC:CRS84", null); + return new GeographyLogicalTypeAnnotation(null, null); } public static UnknownLogicalTypeAnnotation unknownType() { @@ -1259,9 +1250,10 @@ LogicalTypeToken getType() { @Override protected String typeParametersAsString() { - StringBuilder sb = new StringBuilder(); - sb.append("(").append(crs != null && !crs.isEmpty() ? crs : "").append(")"); - return sb.toString(); + if (crs == null || crs.isEmpty()) { + return ""; + } + return String.format("(%s)", crs); } public String getCrs() { @@ -1290,11 +1282,11 @@ PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { public static class GeographyLogicalTypeAnnotation extends LogicalTypeAnnotation { private final String crs; - private final EdgeInterpolationAlgorithm edgeAlgorithm; + private final EdgeInterpolationAlgorithm algorithm; - private GeographyLogicalTypeAnnotation(String crs, EdgeInterpolationAlgorithm edgeAlgorithm) { + private GeographyLogicalTypeAnnotation(String crs, EdgeInterpolationAlgorithm algorithm) { this.crs = crs; - this.edgeAlgorithm = edgeAlgorithm; + this.algorithm = algorithm; } @Override @@ -1315,33 +1307,20 @@ LogicalTypeToken getType() { @Override protected String typeParametersAsString() { - StringBuilder sb = new StringBuilder(); - boolean hasCrs = crs != null && !crs.isEmpty(); - boolean hasEdgeAlgorithm = edgeAlgorithm != null; - - if (!hasCrs && !hasEdgeAlgorithm) { - return ""; // Return empty string when both are empty - } - - sb.append("("); - if (hasCrs) { - sb.append(crs); - } - if (hasEdgeAlgorithm) { - if (hasCrs) sb.append(","); - sb.append(edgeAlgorithm); + boolean hasAlgo = algorithm != null; + if (!hasCrs && !hasAlgo) { + return ""; } - sb.append(")"); - return sb.toString(); + return String.format("(%s,%s)", hasCrs ? crs : DEFAULT_CRS, hasAlgo ? algorithm : DEFAULT_ALGO); } public String getCrs() { return crs; } - public EdgeInterpolationAlgorithm getEdgeAlgorithm() { - return edgeAlgorithm; + public EdgeInterpolationAlgorithm getAlgorithm() { + return algorithm; } @Override @@ -1350,12 +1329,12 @@ public boolean equals(Object obj) { return false; } GeographyLogicalTypeAnnotation other = (GeographyLogicalTypeAnnotation) obj; - return Objects.equals(crs, other.crs) && Objects.equals(edgeAlgorithm, other.edgeAlgorithm); + return Objects.equals(crs, other.crs) && Objects.equals(algorithm, other.algorithm); } @Override public int hashCode() { - return Objects.hash(crs, edgeAlgorithm); + return Objects.hash(crs, algorithm); } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java index 0c696b029e..3bbcca981b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java @@ -450,10 +450,9 @@ private void appendHex(byte[] array, int offset, int length, StringBuilder build @Override String stringifyNotNull(Binary value) { - Geometry geometry; try { WKBReader reader = new WKBReader(); - geometry = reader.read(value.getBytesUnsafe()); + Geometry geometry = reader.read(value.getBytesUnsafe()); return geometry.toText(); } catch (ParseException e) { return BINARY_INVALID; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 1aaab3caf1..24a914cb2d 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -539,11 +539,15 @@ public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnn if (geographyLogicalType.getCrs() != null) { geographyType.setCrs(geographyLogicalType.getCrs()); } - if (geographyLogicalType.getEdgeAlgorithm() != null) { - EdgeInterpolationAlgorithm algorithm = - EdgeInterpolationAlgorithm.valueOf(String.valueOf(geographyLogicalType.getEdgeAlgorithm())); - if (algorithm != null) { + if (geographyLogicalType.getAlgorithm() != null) { + try { + // Convert from schema.EdgeInterpolationAlgorithm to format.EdgeInterpolationAlgorithm + EdgeInterpolationAlgorithm algorithm = EdgeInterpolationAlgorithm.valueOf( + geographyLogicalType.getAlgorithm().name()); geographyType.setAlgorithm(algorithm); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException( + "Unknown EdgeInterpolationAlgorithm value: " + geographyLogicalType.getAlgorithm(), e); } } return of(LogicalType.GEOGRAPHY(geographyType)); @@ -1216,7 +1220,10 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.geometryType(geometry.getCrs()); case GEOGRAPHY: GeographyType geography = type.getGEOGRAPHY(); - return LogicalTypeAnnotation.geographyType(geography.getCrs(), null); + return LogicalTypeAnnotation.geographyType( + geography.getCrs(), + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.valueOf( + geography.getAlgorithm().name())); case VARIANT: VariantType variant = type.getVARIANT(); return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); From e3b9dbb8d730e44f4d2a9e1f0f428ee6fa1b2a19 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Mon, 28 Apr 2025 07:31:56 -0700 Subject: [PATCH 08/10] add unit tests to TestParquetMetadataConverter --- .../converter/ParquetMetadataConverter.java | 17 ++- .../TestParquetMetadataConverter.java | 140 ++++++++++++++++++ 2 files changed, 153 insertions(+), 4 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 24a914cb2d..0a31e5a6ef 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -1220,10 +1220,19 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.geometryType(geometry.getCrs()); case GEOGRAPHY: GeographyType geography = type.getGEOGRAPHY(); - return LogicalTypeAnnotation.geographyType( - geography.getCrs(), - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.valueOf( - geography.getAlgorithm().name())); + // Handle when either algorithm or CRS is null + if (geography == null) { + return LogicalTypeAnnotation.geographyType(null, null); + } + + EdgeInterpolationAlgorithm algorithm = geography.getAlgorithm(); + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgorithm = null; + if (algorithm != null) { + parquetAlgorithm = + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.valueOf(algorithm.name()); + } + + return LogicalTypeAnnotation.geographyType(geography.getCrs(), parquetAlgorithm); case VARIANT: VariantType variant = type.getVARIANT(); return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 322d4c4abc..82c70bed95 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -80,6 +80,7 @@ import org.apache.parquet.column.Encoding; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.DoubleStatistics; @@ -101,6 +102,8 @@ import org.apache.parquet.format.DecimalType; import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.GeographyType; +import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.LogicalType; import org.apache.parquet.format.MapType; import org.apache.parquet.format.PageHeader; @@ -1661,4 +1664,141 @@ public void testSizeStatisticsConversion() { assertEquals(repLevelHistogram, sizeStatistics.getRepetitionLevelHistogram()); assertEquals(defLevelHistogram, sizeStatistics.getDefinitionLevelHistogram()); } + + @Test + public void testGeometryLogicalType() { + ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); + + // Create schema with geometry type + MessageType schema = Types.buildMessage() + .required(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType("EPSG:4326")) + .named("geomField") + .named("Message"); + + // Convert to parquet schema and back + List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); + MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + + // Verify the logical type is preserved + assertEquals(schema, actual); + + PrimitiveType primitiveType = actual.getType("geomField").asPrimitiveType(); + LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); + assertTrue(logicalType instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); + assertEquals("EPSG:4326", ((LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) logicalType).getCrs()); + } + + @Test + public void testGeographyLogicalType() { + ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); + + // Create schema with geography type + MessageType schema = Types.buildMessage() + .required(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL)) + .named("geogField") + .named("Message"); + + // Convert to parquet schema and back + List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); + MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + + // Verify the logical type is preserved + assertEquals(schema, actual); + + PrimitiveType primitiveType = actual.getType("geogField").asPrimitiveType(); + LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); + assertTrue(logicalType instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); + + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) logicalType; + assertEquals("EPSG:4326", geographyType.getCrs()); + assertEquals(EdgeInterpolationAlgorithm.SPHERICAL, geographyType.getAlgorithm()); + } + + @Test + public void testGeometryLogicalTypeWithMissingCrs() { + // Create a Geometry logical type without specifying CRS + GeometryType geometryType = new GeometryType(); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOMETRY(geometryType); + + // Convert to LogicalTypeAnnotation + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + assertNotNull("Geometry annotation should not be null", annotation); + assertTrue( + "Should be a GeometryLogicalTypeAnnotation", + annotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); + + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryAnnotation = + (LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) annotation; + + // Default behavior should use null or empty CRS + assertNull("CRS should be null or empty when not specified", geometryAnnotation.getCrs()); + } + + @Test + public void testGeographyLogicalTypeWithMissingParameters() { + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + + // Create a Geography logical type without CRS and algorithm + GeographyType geographyType = new GeographyType(); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOGRAPHY(geographyType); + + // Convert to LogicalTypeAnnotation + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + assertNotNull("Geography annotation should not be null", annotation); + assertTrue( + "Should be a GeographyLogicalTypeAnnotation", + annotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); + + // Check that optional parameters are handled correctly + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; + assertNull("CRS should be null when not specified", geographyAnnotation.getCrs()); + // Most implementations default to LINEAR when algorithm is not specified + assertNull("Algorithm should be null when not specified", geographyAnnotation.getAlgorithm()); + + // Now test the round-trip conversion + LogicalType roundTripType = converter.convertToLogicalType(annotation); + assertEquals("setField should be GEOGRAPHY", LogicalType._Fields.GEOGRAPHY, roundTripType.getSetField()); + assertNull( + "Round trip CRS should still be null", + roundTripType.getGEOGRAPHY().getCrs()); + assertNull( + "Round trip Algorithm should be null", + roundTripType.getGEOGRAPHY().getAlgorithm()); + } + + @Test + public void testGeographyLogicalTypeWithAlgorithmButNoCrs() { + // Create a Geography logical type with algorithm but no CRS + GeographyType geographyType = new GeographyType(); + geographyType.setAlgorithm(org.apache.parquet.format.EdgeInterpolationAlgorithm.SPHERICAL); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOGRAPHY(geographyType); + + // Convert to LogicalTypeAnnotation + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + Assert.assertNotNull("Geography annotation should not be null", annotation); + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; + + // CRS should be null/empty but algorithm should be set + assertNull("CRS should be null or empty", geographyAnnotation.getCrs()); + assertEquals( + "Algorithm should be SPHERICAL", + EdgeInterpolationAlgorithm.SPHERICAL, + geographyAnnotation.getAlgorithm()); + } } From 3304de6f4b179accf53ac71f1647a56e3c3b4ae8 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Mon, 28 Apr 2025 09:16:51 -0700 Subject: [PATCH 09/10] address review comments and refactor the algorithm conversion --- .../converter/ParquetMetadataConverter.java | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 0a31e5a6ef..f38f77b461 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -527,7 +527,8 @@ public Optional visit(LogicalTypeAnnotation.VariantLogicalTypeAnnot @Override public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { GeometryType geometryType = new GeometryType(); - if (geometryLogicalType.getCrs() != null) { + if (geometryLogicalType.getCrs() != null + && !geometryLogicalType.getCrs().isEmpty()) { geometryType.setCrs(geometryLogicalType.getCrs()); } return of(LogicalType.GEOMETRY(geometryType)); @@ -539,16 +540,12 @@ public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnn if (geographyLogicalType.getCrs() != null) { geographyType.setCrs(geographyLogicalType.getCrs()); } - if (geographyLogicalType.getAlgorithm() != null) { - try { - // Convert from schema.EdgeInterpolationAlgorithm to format.EdgeInterpolationAlgorithm - EdgeInterpolationAlgorithm algorithm = EdgeInterpolationAlgorithm.valueOf( - geographyLogicalType.getAlgorithm().name()); - geographyType.setAlgorithm(algorithm); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException( - "Unknown EdgeInterpolationAlgorithm value: " + geographyLogicalType.getAlgorithm(), e); - } + if (geographyLogicalType.getAlgorithm() != null + && !geographyLogicalType.getCrs().isEmpty()) { + // Convert from schema.EdgeInterpolationAlgorithm to format.EdgeInterpolationAlgorithm + EdgeInterpolationAlgorithm algorithm = + fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm()); + geographyType.setAlgorithm(algorithm); } return of(LogicalType.GEOGRAPHY(geographyType)); } @@ -1220,16 +1217,10 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.geometryType(geometry.getCrs()); case GEOGRAPHY: GeographyType geography = type.getGEOGRAPHY(); - // Handle when either algorithm or CRS is null - if (geography == null) { - return LogicalTypeAnnotation.geographyType(null, null); - } - EdgeInterpolationAlgorithm algorithm = geography.getAlgorithm(); org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgorithm = null; if (algorithm != null) { - parquetAlgorithm = - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.valueOf(algorithm.name()); + parquetAlgorithm = toParquetEdgeInterpolationAlgorithm(algorithm); } return LogicalTypeAnnotation.geographyType(geography.getCrs(), parquetAlgorithm); @@ -2540,4 +2531,26 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s } return formatStats; } + + /** Convert Parquet Algorithm enum to Thrift Algorithm enum */ + public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { + if (parquetAlgo == null) { + return null; + } + EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue()); + if (thriftAlgo == null) { + throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo); + } + return thriftAlgo; + } + + /** Convert Thrift Algorithm enum to Parquet Algorithm enum */ + public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm thriftAlgo) { + if (thriftAlgo == null) { + return null; + } + return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue()); + } } From 89d8a74f6e82e7ccbd3eee9754ad85a7e3d9fa13 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 29 Apr 2025 09:36:02 +0800 Subject: [PATCH 10/10] fix and refactor metadata converter --- .../converter/ParquetMetadataConverter.java | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index f38f77b461..15fcd14a73 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -537,16 +537,11 @@ public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnno @Override public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { GeographyType geographyType = new GeographyType(); - if (geographyLogicalType.getCrs() != null) { - geographyType.setCrs(geographyLogicalType.getCrs()); - } - if (geographyLogicalType.getAlgorithm() != null + if (geographyLogicalType.getCrs() != null && !geographyLogicalType.getCrs().isEmpty()) { - // Convert from schema.EdgeInterpolationAlgorithm to format.EdgeInterpolationAlgorithm - EdgeInterpolationAlgorithm algorithm = - fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm()); - geographyType.setAlgorithm(algorithm); + geographyType.setCrs(geographyLogicalType.getCrs()); } + geographyType.setAlgorithm(fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm())); return of(LogicalType.GEOGRAPHY(geographyType)); } } @@ -1217,13 +1212,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.geometryType(geometry.getCrs()); case GEOGRAPHY: GeographyType geography = type.getGEOGRAPHY(); - EdgeInterpolationAlgorithm algorithm = geography.getAlgorithm(); - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgorithm = null; - if (algorithm != null) { - parquetAlgorithm = toParquetEdgeInterpolationAlgorithm(algorithm); - } - - return LogicalTypeAnnotation.geographyType(geography.getCrs(), parquetAlgorithm); + return LogicalTypeAnnotation.geographyType( + geography.getCrs(), toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm())); case VARIANT: VariantType variant = type.getVARIANT(); return LogicalTypeAnnotation.variantType(variant.getSpecification_version());