Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions parquet-column/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@
<version>${slf4j.version}</version>
</dependency>

<dependency>
<groupId>org.locationtech.jts</groupId>
<artifactId>jts-core</artifactId>
<version>${jts.version}</version>
</dependency>

<dependency>
<groupId>com.carrotsearch</groupId>
<artifactId>junit-benchmarks</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.column.schema;

/**
* Edge interpolation algorithm for Geography logical type
*/
public enum EdgeInterpolationAlgorithm {
SPHERICAL(0),
VINCENTY(1),
THOMAS(2),
ANDOYER(3),
KARNEY(4);

private final int value;

private EdgeInterpolationAlgorithm(int value) {
this.value = value;
}

/**
* Get the integer value of this enum value, as defined in the Thrift IDL.
*/
public int getValue() {
return value;
}

/**
* Find the enum type by its integer value, as defined in the Thrift IDL.
* @return null if the value is not found.
*/
public static EdgeInterpolationAlgorithm findByValue(int value) {
switch (value) {
case 0:
return SPHERICAL;
case 1:
return VINCENTY;
case 2:
return THOMAS;
case 3:
return ANDOYER;
case 4:
return KARNEY;
default:
throw new IllegalArgumentException("Unrecognized EdgeInterpolationAlgorithm value: " + value);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,13 @@
import java.util.Set;
import java.util.function.Supplier;
import org.apache.parquet.Preconditions;
import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm;

public abstract class LogicalTypeAnnotation {

public static final String DEFAULT_CRS = "OGC:CRS84";
public static final EdgeInterpolationAlgorithm DEFAULT_ALGO = EdgeInterpolationAlgorithm.SPHERICAL;

enum LogicalTypeToken {
MAP {
@Override
Expand Down Expand Up @@ -155,6 +160,31 @@ protected LogicalTypeAnnotation fromString(List<String> params) {
return float16Type();
}
},
GEOMETRY {
@Override
protected LogicalTypeAnnotation fromString(List<String> params) {
if (params.size() > 1) {
throw new RuntimeException(
"Expecting at most 1 parameter for geometry logical type, got " + params.size());
}
String crs = params.isEmpty() ? null : params.get(0);
return geometryType(crs);
}
},
GEOGRAPHY {
@Override
protected LogicalTypeAnnotation fromString(List<String> params) {
if (params.size() > 2) {
throw new RuntimeException(
"Expecting at most 2 parameters for geography logical type (crs and edge algorithm), got "
+ params.size());
}
String crs = !params.isEmpty() ? params.get(0) : null;
EdgeInterpolationAlgorithm algo =
params.size() > 1 ? EdgeInterpolationAlgorithm.valueOf(params.get(1)) : null;
return geographyType(crs, algo);
}
},
UNKNOWN {
@Override
protected LogicalTypeAnnotation fromString(List<String> params) {
Expand Down Expand Up @@ -334,6 +364,18 @@ public static Float16LogicalTypeAnnotation float16Type() {
return Float16LogicalTypeAnnotation.INSTANCE;
}

public static GeometryLogicalTypeAnnotation geometryType(String crs) {
return new GeometryLogicalTypeAnnotation(crs);
}

public static GeographyLogicalTypeAnnotation geographyType(String crs, EdgeInterpolationAlgorithm edgeAlgorithm) {
return new GeographyLogicalTypeAnnotation(crs, edgeAlgorithm);
}

public static GeographyLogicalTypeAnnotation geographyType() {
return new GeographyLogicalTypeAnnotation(null, null);
}

public static UnknownLogicalTypeAnnotation unknownType() {
return UnknownLogicalTypeAnnotation.INSTANCE;
}
Expand Down Expand Up @@ -1183,6 +1225,124 @@ public boolean equals(Object obj) {
}
}

public static class GeometryLogicalTypeAnnotation extends LogicalTypeAnnotation {
private final String crs;

private GeometryLogicalTypeAnnotation(String crs) {
this.crs = crs;
}

@Override
@Deprecated
public OriginalType toOriginalType() {
return null;
}

@Override
public <T> Optional<T> accept(LogicalTypeAnnotationVisitor<T> logicalTypeAnnotationVisitor) {
return logicalTypeAnnotationVisitor.visit(this);
}

@Override
LogicalTypeToken getType() {
return LogicalTypeToken.GEOMETRY;
}

@Override
protected String typeParametersAsString() {
if (crs == null || crs.isEmpty()) {
return "";
}
return String.format("(%s)", crs);
}

public String getCrs() {
return crs;
}

@Override
public boolean equals(Object obj) {
if (!(obj instanceof GeometryLogicalTypeAnnotation)) {
return false;
}
GeometryLogicalTypeAnnotation other = (GeometryLogicalTypeAnnotation) obj;
return Objects.equals(crs, other.crs);
}

@Override
public int hashCode() {
return Objects.hash(crs);
}

@Override
PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) {
return PrimitiveStringifier.WKB_STRINGIFIER;
}
}

public static class GeographyLogicalTypeAnnotation extends LogicalTypeAnnotation {
private final String crs;
private final EdgeInterpolationAlgorithm algorithm;

private GeographyLogicalTypeAnnotation(String crs, EdgeInterpolationAlgorithm algorithm) {
this.crs = crs;
this.algorithm = algorithm;
}

@Override
@Deprecated
public OriginalType toOriginalType() {
return null;
}

@Override
public <T> Optional<T> accept(LogicalTypeAnnotationVisitor<T> logicalTypeAnnotationVisitor) {
return logicalTypeAnnotationVisitor.visit(this);
}

@Override
LogicalTypeToken getType() {
return LogicalTypeToken.GEOGRAPHY;
}

@Override
protected String typeParametersAsString() {
boolean hasCrs = crs != null && !crs.isEmpty();
boolean hasAlgo = algorithm != null;
if (!hasCrs && !hasAlgo) {
return "";
}
return String.format("(%s,%s)", hasCrs ? crs : DEFAULT_CRS, hasAlgo ? algorithm : DEFAULT_ALGO);
}

public String getCrs() {
return crs;
}

public EdgeInterpolationAlgorithm getAlgorithm() {
return algorithm;
}

@Override
public boolean equals(Object obj) {
if (!(obj instanceof GeographyLogicalTypeAnnotation)) {
return false;
}
GeographyLogicalTypeAnnotation other = (GeographyLogicalTypeAnnotation) obj;
return Objects.equals(crs, other.crs) && Objects.equals(algorithm, other.algorithm);
}

@Override
public int hashCode() {
return Objects.hash(crs, algorithm);
}

@Override
PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) {
return PrimitiveStringifier.WKB_STRINGIFIER;
}
}

/**
* Implement this interface to visit a logical type annotation in the schema.
* The default implementation for each logical type specific visitor method is empty.
Expand Down Expand Up @@ -1259,6 +1419,14 @@ default Optional<T> visit(Float16LogicalTypeAnnotation float16LogicalType) {
return empty();
}

default Optional<T> visit(GeometryLogicalTypeAnnotation geometryLogicalType) {
return empty();
}

default Optional<T> visit(GeographyLogicalTypeAnnotation geographyLogicalType) {
return empty();
}

default Optional<T> visit(UnknownLogicalTypeAnnotation unknownLogicalTypeAnnotation) {
return empty();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
import java.util.concurrent.TimeUnit;
import javax.naming.OperationNotSupportedException;
import org.apache.parquet.io.api.Binary;
import org.locationtech.jts.geom.Geometry;
import org.locationtech.jts.io.ParseException;
import org.locationtech.jts.io.WKBReader;

/**
* Class that provides string representations for the primitive values. These string values are to be used for
Expand Down Expand Up @@ -442,6 +445,21 @@ private void appendHex(byte[] array, int offset, int length, StringBuilder build
}
};

static final PrimitiveStringifier WKB_STRINGIFIER = new BinaryStringifierBase("WKB_STRINGIFIER") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wgtmac and @zhangfengcdt, what do you think about updating this to use WKT only if JTS is available, rather than adding JTS to the classpath for all downstream clients? This could just produce a generic string if a geo library isn't available.

Copy link
Member

@wgtmac wgtmac May 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes sense for the printer. To implement geospatial statistics, we need to depend on JTS to parse WKB. Otherwise we could only blindly write them as blobs without producing any bbox and other stats. If this is acceptable, I think we can make such changes.

@jiayuasu @szehon-ho WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it’s reasonable to add JTS as a dependency. Its license is compatible with the ASF, and it’s the most widely used geospatial library in the industry. The API is also very stable. JTS has been around for over 20 years.

An alternative would be to implement a standalone WKB parser and maintain it within parquet-java, but that could introduce significant long-term maintenance overhead for the community.

For context, the Parquet Geo C++ PR included a standalone WKB parser because (1) there isn’t a clean, well-maintained WKB parser in C++, and (2) Dewey had already implemented WKB parsing in C++ multiple times before.


@Override
String stringifyNotNull(Binary value) {

try {
WKBReader reader = new WKBReader();
Geometry geometry = reader.read(value.getBytesUnsafe());
return geometry.toText();
} catch (ParseException e) {
return BINARY_INVALID;
}
}
};

static final PrimitiveStringifier FLOAT16_STRINGIFIER = new BinaryStringifierBase("FLOAT16_STRINGIFIER") {

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,18 @@ public Optional<PrimitiveComparator> visit(
LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR);
}

@Override
public Optional<PrimitiveComparator> visit(
LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) {
return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR);
}

@Override
public Optional<PrimitiveComparator> visit(
LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) {
return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR);
}
})
.orElseThrow(() -> new ShouldNeverHappenException(
"No comparator logic implemented for BINARY logical type: " + logicalType));
Expand Down
12 changes: 12 additions & 0 deletions parquet-column/src/main/java/org/apache/parquet/schema/Types.java
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,18 @@ public Optional<Boolean> visit(
return checkBinaryPrimitiveType(enumLogicalType);
}

@Override
public Optional<Boolean> visit(
LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) {
return checkBinaryPrimitiveType(geometryLogicalType);
}

@Override
public Optional<Boolean> visit(
LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) {
return checkBinaryPrimitiveType(geographyLogicalType);
}

private Optional<Boolean> checkFixedPrimitiveType(
int l, LogicalTypeAnnotation logicalTypeAnnotation) {
Preconditions.checkState(
Expand Down
Loading