diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundGeospatialPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundGeospatialPredicate.java new file mode 100644 index 000000000000..4b0ce6cc5498 --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundGeospatialPredicate.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.expressions; + +import java.nio.ByteBuffer; +import org.apache.iceberg.geospatial.BoundingBox; + +public class BoundGeospatialPredicate extends BoundPredicate { + private final Literal literal; + + BoundGeospatialPredicate(Operation op, BoundTerm term, Literal literal) { + super(op, term); + this.literal = literal; + } + + public Literal literal() { + return literal; + } + + @Override + public boolean test(ByteBuffer value) { + throw new UnsupportedOperationException( + "Evaluation of spatial predicate \"" + + op() + + "\" against geometry/geography value is not implemented."); + } + + @Override + public boolean isGeospatialPredicate() { + return true; + } + + @Override + public BoundGeospatialPredicate asGeospatialPredicate() { + return this; + } + + @Override + public Expression negate() { + return new BoundGeospatialPredicate(op().negate(), term(), literal); + } + + @Override + public boolean isEquivalentTo(Expression expr) { + if (!(expr instanceof BoundGeospatialPredicate)) { + return false; + } + + BoundGeospatialPredicate other = (BoundGeospatialPredicate) expr; + return op() == other.op() + && term().isEquivalentTo(other.term()) + && literal.value().equals(other.literal.value()); + } + + @Override + public String toString() { + switch (op()) { + case ST_INTERSECTS: + return term().toString() + " stIntersects " + literal.value(); + case ST_DISJOINT: + return term().toString() + " stDisjoint " + literal.value(); + default: + return "Invalid geospatial predicate: operation = " + op(); + } + } +} diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java index 95e1aeaa2592..af4082514988 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java @@ -65,4 +65,12 @@ public boolean isSetPredicate() { public BoundSetPredicate asSetPredicate() { 
throw new IllegalStateException("Not a set predicate: " + this); } + + public boolean isGeospatialPredicate() { + return false; + } + + public BoundGeospatialPredicate asGeospatialPredicate() { + throw new IllegalStateException("Not a geospatial predicate: " + this); + } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java index 96e148a2d438..9b6c2b725579 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.iceberg.StructLike; import org.apache.iceberg.expressions.ExpressionVisitors.BoundVisitor; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -156,5 +157,25 @@ public Boolean startsWith(Bound valueExpr, Literal lit) { public Boolean notStartsWith(Bound valueExpr, Literal lit) { return !startsWith(valueExpr, lit); } + + @Override + public Boolean stIntersects(Bound valueExpr, Literal literal) { + // Evaluation of stIntersects against geometry/geography value is not supported. Spatial + // predicates only + // support data skipping but not filtering individual records in iceberg-api. Readers should + // expect + // false-positives and run the actual spatial filters on their own. + return true; + } + + @Override + public Boolean stDisjoint(Bound valueExpr, Literal literal) { + // Evaluation of stDisjoint against geometry/geography value is not supported. Spatial + // predicates only + // support data skipping but not filtering individual records in iceberg-api. Readers should + // expect + // false-positives and run the actual spatial filters on their own.
+ return true; + } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expression.java b/api/src/main/java/org/apache/iceberg/expressions/Expression.java index dc88172c590d..7618d67538a3 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expression.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Expression.java @@ -44,6 +44,8 @@ enum Operation { OR, STARTS_WITH, NOT_STARTS_WITH, + ST_INTERSECTS, + ST_DISJOINT, COUNT, COUNT_STAR, MAX, @@ -90,6 +92,10 @@ public Operation negate() { return Operation.NOT_STARTS_WITH; case NOT_STARTS_WITH: return Operation.STARTS_WITH; + case ST_INTERSECTS: + return Operation.ST_DISJOINT; + case ST_DISJOINT: + return Operation.ST_INTERSECTS; default: throw new IllegalArgumentException("No negation for operation: " + this); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index d3dc00d914c7..ef09551fe003 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.expressions; +import java.nio.ByteBuffer; import java.time.Instant; import java.time.OffsetDateTime; import java.time.ZoneOffset; @@ -34,6 +35,7 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Table; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.transforms.Transforms; @@ -318,6 +320,10 @@ public Expression predicate(BoundPredicate pred) { .map(lit -> (T) sanitize(bound.term().type(), lit, now, today)) .iterator(); return new UnboundPredicate<>(pred.op(), unbind(pred.term()), iter); + } else if (pred.isGeospatialPredicate()) { + BoundGeospatialPredicate bound = 
(BoundGeospatialPredicate) pred; + return Expressions.geospatialPredicate( + pred.op(), unbind(bound.term()), BoundingBox.empty()); } throw new UnsupportedOperationException("Cannot sanitize bound predicate type: " + pred.op()); @@ -343,6 +349,10 @@ public Expression predicate(UnboundPredicate pred) { case NOT_STARTS_WITH: return new UnboundPredicate<>( pred.op(), pred.term(), (T) sanitize(pred.literal(), now, today)); + case ST_INTERSECTS: + case ST_DISJOINT: + return Expressions.geospatialPredicate( + pred.op(), (UnboundTerm) pred.term(), BoundingBox.empty()); case IN: case NOT_IN: Iterable iter = @@ -493,6 +503,10 @@ public String predicate(UnboundPredicate pred) { return term + " STARTS WITH " + sanitize(pred.literal(), nowMicros, today); case NOT_STARTS_WITH: return term + " NOT STARTS WITH " + sanitize(pred.literal(), nowMicros, today); + case ST_INTERSECTS: + return term + " ST_INTERSECTS WITH (bounding-box)"; + case ST_DISJOINT: + return term + " ST_DISJOINT WITH (bounding-box)"; default: throw new UnsupportedOperationException( "Cannot sanitize unsupported predicate type: " + pred.op()); diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java index 79ca6a712887..fc2a67822d8a 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java @@ -21,6 +21,7 @@ import java.util.Set; import java.util.function.Supplier; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; /** Utils for traversing {@link Expression expressions}. 
*/ public class ExpressionVisitors { @@ -126,6 +127,16 @@ public R notStartsWith(BoundReference ref, Literal lit) { "notStartsWith expression is not supported by the visitor"); } + public R stIntersects(BoundReference ref, Literal lit) { + throw new UnsupportedOperationException( + "stIntersects expression is not supported by the visitor"); + } + + public R stDisjoint(BoundReference ref, Literal lit) { + throw new UnsupportedOperationException( + "stDisjoint expression is not supported by the visitor"); + } + /** * Handle a non-reference value in this visitor. * @@ -196,6 +207,19 @@ public R predicate(BoundPredicate pred) { throw new IllegalStateException( "Invalid operation for BoundSetPredicate: " + pred.op()); } + + } else if (pred.isGeospatialPredicate()) { + switch (pred.op()) { + case ST_INTERSECTS: + return stIntersects( + (BoundReference) pred.term(), pred.asGeospatialPredicate().literal()); + case ST_DISJOINT: + return stDisjoint( + (BoundReference) pred.term(), pred.asGeospatialPredicate().literal()); + default: + throw new IllegalStateException( + "Invalid operation for BoundGeospatialPredicate: " + pred.op()); + } } throw new IllegalStateException("Unsupported bound predicate: " + pred.getClass().getName()); @@ -266,6 +290,14 @@ public R notStartsWith(Bound expr, Literal lit) { throw new UnsupportedOperationException("Unsupported operation."); } + public R stIntersects(Bound term, Literal literal) { + throw new UnsupportedOperationException("ST_INTERSECTS is not supported by the visitor"); + } + + public R stDisjoint(Bound term, Literal literal) { + throw new UnsupportedOperationException("ST_DISJOINT is not supported by the visitor"); + } + @Override public R predicate(BoundPredicate pred) { if (pred.isLiteralPredicate()) { @@ -317,8 +349,15 @@ public R predicate(BoundPredicate pred) { throw new IllegalStateException( "Invalid operation for BoundSetPredicate: " + pred.op()); } - } + } else if (pred.isGeospatialPredicate()) { + switch (pred.op()) { + 
case ST_INTERSECTS: + return stIntersects(pred.term(), pred.asGeospatialPredicate().literal()); + case ST_DISJOINT: + return stDisjoint(pred.term(), pred.asGeospatialPredicate().literal()); + } + } throw new IllegalStateException("Unsupported bound predicate: " + pred.getClass().getName()); } @@ -495,6 +534,13 @@ public R predicate(BoundPredicate pred) { throw new IllegalStateException( "Invalid operation for BoundSetPredicate: " + pred.op()); } + } else if (pred.isGeospatialPredicate()) { + switch (pred.op()) { + case ST_INTERSECTS: + return stIntersects(pred.term(), pred.asGeospatialPredicate().literal()); + case ST_DISJOINT: + return stDisjoint(pred.term(), pred.asGeospatialPredicate().literal()); + } } throw new IllegalStateException("Unsupported bound predicate: " + pred.getClass().getName()); @@ -555,6 +601,14 @@ public R startsWith(BoundTerm term, Literal lit) { public R notStartsWith(BoundTerm term, Literal lit) { return null; } + + public R stIntersects(BoundTerm term, Literal lit) { + return null; + } + + public R stDisjoint(BoundTerm term, Literal lit) { + return null; + } } /** diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java index ba1b5ad1cb52..83e1decda994 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java @@ -18,8 +18,10 @@ */ package org.apache.iceberg.expressions; +import java.nio.ByteBuffer; import java.util.stream.Stream; import org.apache.iceberg.expressions.Expression.Operation; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.transforms.Transform; @@ -202,6 +204,44 @@ public static UnboundPredicate notStartsWith(UnboundTerm expr, S return new 
UnboundPredicate<>(Expression.Operation.NOT_STARTS_WITH, expr, value); } + public static UnboundPredicate stIntersects(String name, BoundingBox value) { + return geospatialPredicate(Operation.ST_INTERSECTS, name, value); + } + + public static UnboundPredicate stIntersects( + String name, ByteBuffer min, ByteBuffer max) { + return geospatialPredicate(Operation.ST_INTERSECTS, name, min, max); + } + + public static UnboundPredicate stIntersects( + UnboundTerm expr, BoundingBox value) { + return geospatialPredicate(Operation.ST_INTERSECTS, expr, value); + } + + public static UnboundPredicate stIntersects( + UnboundTerm expr, ByteBuffer min, ByteBuffer max) { + return geospatialPredicate(Operation.ST_INTERSECTS, expr, min, max); + } + + public static UnboundPredicate stDisjoint(String name, BoundingBox value) { + return geospatialPredicate(Operation.ST_DISJOINT, name, value); + } + + public static UnboundPredicate stDisjoint( + String name, ByteBuffer min, ByteBuffer max) { + return geospatialPredicate(Operation.ST_DISJOINT, name, min, max); + } + + public static UnboundPredicate stDisjoint( + UnboundTerm expr, BoundingBox value) { + return geospatialPredicate(Operation.ST_DISJOINT, expr, value); + } + + public static UnboundPredicate stDisjoint( + UnboundTerm expr, ByteBuffer min, ByteBuffer max) { + return geospatialPredicate(Operation.ST_DISJOINT, expr, min, max); + } + public static UnboundPredicate in(String name, T... 
values) { return predicate(Operation.IN, name, Lists.newArrayList(values)); } @@ -280,6 +320,27 @@ public static UnboundPredicate predicate(Operation op, UnboundTerm exp return new UnboundPredicate<>(op, expr); } + public static UnboundPredicate geospatialPredicate( + Operation op, String name, BoundingBox value) { + return geospatialPredicate( + op, ref(name), value.min().toByteBuffer(), value.max().toByteBuffer()); + } + + public static UnboundPredicate geospatialPredicate( + Operation op, UnboundTerm expr, BoundingBox value) { + return geospatialPredicate(op, expr, value.min().toByteBuffer(), value.max().toByteBuffer()); + } + + public static UnboundPredicate geospatialPredicate( + Operation op, String name, ByteBuffer min, ByteBuffer max) { + return geospatialPredicate(op, ref(name), min, max); + } + + public static UnboundPredicate geospatialPredicate( + Operation op, UnboundTerm expr, ByteBuffer min, ByteBuffer max) { + return new UnboundPredicate<>(op, expr, Lists.newArrayList(min, max)); + } + public static True alwaysTrue() { return True.INSTANCE; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index aa0441f49011..72d141670cd9 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -29,6 +29,8 @@ import org.apache.iceberg.ContentFile; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialPredicateEvaluators; import org.apache.iceberg.transforms.Transform; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; @@ -471,6 +473,35 @@ public Boolean notStartsWith(Bound term, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + public Boolean stIntersects(Bound 
term, Literal lit) { + T lower = lowerBound(term); + T upper = upperBound(term); + + if (lower == null || upper == null) { + return ROWS_MIGHT_MATCH; + } + + if (lit.value() != null && lower instanceof ByteBuffer && upper instanceof ByteBuffer) { + BoundingBox dataBox = BoundingBox.fromByteBuffers((ByteBuffer) lower, (ByteBuffer) upper); + BoundingBox queryBox = lit.value(); + + // If the data box and query box do not intersect, no records can match + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(term.ref().type()); + if (!evaluator.intersects(dataBox, queryBox)) { + return ROWS_CANNOT_MATCH; + } + } + + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean stDisjoint(Bound term, Literal lit) { + return ROWS_MIGHT_MATCH; + } + private boolean mayContainNull(Integer id) { return nullCounts == null || !nullCounts.containsKey(id) || nullCounts.get(id) != 0; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literal.java b/api/src/main/java/org/apache/iceberg/expressions/Literal.java index b5d6f72f74d0..981ae0e6be1a 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literal.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literal.java @@ -23,6 +23,7 @@ import java.nio.ByteBuffer; import java.util.Comparator; import java.util.UUID; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.types.Type; /** @@ -71,6 +72,10 @@ static Literal of(BigDecimal value) { return new Literals.DecimalLiteral(value); } + static Literal of(BoundingBox value) { + return new Literals.GeospatialBoundingBoxLiteral(value); + } + /** Returns the value wrapped by this literal.
*/ T value(); diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 3a45eb804f35..b5694df13ad1 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -32,6 +32,7 @@ import java.util.Comparator; import java.util.Objects; import java.util.UUID; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding; import org.apache.iceberg.types.Comparators; @@ -85,6 +86,8 @@ static Literal from(T value) { return (Literal) new Literals.DecimalLiteral((BigDecimal) value); } else if (value instanceof Variant) { return (Literal) new Literals.VariantLiteral((Variant) value); + } else if (value instanceof BoundingBox) { + return (Literal) new Literals.GeospatialBoundingBoxLiteral((BoundingBox) value); } throw new IllegalArgumentException( @@ -719,4 +722,35 @@ public String toString() { return "X'" + BaseEncoding.base16().encode(bytes) + "'"; } } + + static class GeospatialBoundingBoxLiteral implements Literal { + private static final Comparator CMP = + Comparators.nullsFirst().thenComparing(Comparator.naturalOrder()); + + private final BoundingBox value; + + GeospatialBoundingBoxLiteral(BoundingBox value) { + this.value = value; + } + + @Override + public BoundingBox value() { + return value; + } + + @Override + public Literal to(Type type) { + return null; + } + + @Override + public Comparator comparator() { + return CMP; + } + + @Override + public String toString() { + return String.valueOf(value()); + } + } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..8b49f31c7096 100644 --- 
a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -29,6 +29,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -472,6 +473,16 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_NOT_MATCH; } + @Override + public Boolean stIntersects(BoundReference ref, Literal lit) { + return ROWS_MIGHT_NOT_MATCH; + } + + @Override + public Boolean stDisjoint(BoundReference ref, Literal lit) { + return ROWS_MIGHT_NOT_MATCH; + } + private boolean isNestedColumn(int id) { return struct.field(id) == null; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java index 4736ca4a8668..d3d25dee0662 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java @@ -18,9 +18,11 @@ */ package org.apache.iceberg.expressions; +import java.nio.ByteBuffer; import java.util.List; import java.util.Set; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -119,6 +121,10 @@ public Expression bind(StructType struct, boolean caseSensitive) { return bindInOperation(bound); } + if (op() == Operation.ST_INTERSECTS || op() == Operation.ST_DISJOINT) { + return bindGeospatialOperation(bound); + } + return 
bindLiteralOperation(bound); } @@ -249,6 +255,36 @@ private Expression bindInOperation(BoundTerm boundTerm) { return new BoundSetPredicate<>(op(), boundTerm, literalSet); } + @SuppressWarnings("unchecked") + private Expression bindGeospatialOperation(BoundTerm boundTerm) { + Type.TypeID typeId = boundTerm.type().typeId(); + if (typeId != Type.TypeID.GEOMETRY && typeId != Type.TypeID.GEOGRAPHY) { + throw new ValidationException( + "Cannot bind geospatial operation to non-geospatial type: %s", boundTerm); + } + + Literal minLiteral = literals.get(0); + Literal min = minLiteral.to(Types.BinaryType.get()); + if (min == null) { + throw new ValidationException( + "Invalid value for conversion to type %s: %s (%s)", + Types.BinaryType.get(), minLiteral.value(), minLiteral.value().getClass().getName()); + } + + Literal maxLiteral = literals.get(1); + Literal max = maxLiteral.to(Types.BinaryType.get()); + if (max == null) { + throw new ValidationException( + "Invalid value for conversion to type %s: %s (%s)", + Types.BinaryType.get(), maxLiteral.value(), maxLiteral.value().getClass().getName()); + } + + return new BoundGeospatialPredicate( + op(), + (BoundTerm) boundTerm, + Literals.from(BoundingBox.fromByteBuffers(min.value(), max.value()))); + } + @Override public String toString() { switch (op()) { @@ -276,6 +312,14 @@ public String toString() { return term() + " startsWith \"" + literal() + "\""; case NOT_STARTS_WITH: return term() + " notStartsWith \"" + literal() + "\""; + case ST_INTERSECTS: + case ST_DISJOINT: + Literal minLiteral = literals.get(0).to(Types.BinaryType.get()); + Literal maxLiteral = literals.get(1).to(Types.BinaryType.get()); + String opName = op() == Operation.ST_INTERSECTS ? 
" stIntersects " : " stDisjoint "; + return term() + + opName + + BoundingBox.fromByteBuffers(minLiteral.value(), maxLiteral.value()); case IN: return term() + " in (" + COMMA.join(literals()) + ")"; case NOT_IN: diff --git a/api/src/main/java/org/apache/iceberg/geospatial/BoundingBox.java b/api/src/main/java/org/apache/iceberg/geospatial/BoundingBox.java new file mode 100644 index 000000000000..1ee214a11d2d --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/geospatial/BoundingBox.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.geospatial; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Objects; + +/** + * Represents a geospatial bounding box composed of minimum and maximum bounds. + * + *

A bounding box (also called a Minimum Bounding Rectangle or MBR) is defined by two points: the + * minimum and maximum coordinates that define the box's corners. This provides a simple + * approximation of a more complex geometry for efficient filtering and data skipping. + */ +public class BoundingBox implements Serializable, Comparable { + /** + * Create a {@link BoundingBox} object from buffers containing min and max bounds + * + * @param min the serialized minimum bound + * @param max the serialized maximum bound + * @return a BoundingBox instance + */ + public static BoundingBox fromByteBuffers(ByteBuffer min, ByteBuffer max) { + return new BoundingBox( + GeospatialBound.fromByteBuffer(min), GeospatialBound.fromByteBuffer(max)); + } + + /** + * Create an empty bounding box + * + * @return an empty bounding box + */ + public static BoundingBox empty() { + return new BoundingBox( + GeospatialBound.createXY(Double.NaN, Double.NaN), + GeospatialBound.createXY(Double.NaN, Double.NaN)); + } + + public BoundingBox(GeospatialBound min, GeospatialBound max) { + this.min = min; + this.max = max; + } + + private final GeospatialBound min; + private final GeospatialBound max; + + /** + * Get the minimum corner of the bounding box. + * + * @return the minimum bound + */ + public GeospatialBound min() { + return min; + } + + /** + * Get the maximum corner of the bounding box. 
+ * + * @return the maximum bound + */ + public GeospatialBound max() { + return max; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } else if (!(other instanceof BoundingBox)) { + return false; + } + + BoundingBox that = (BoundingBox) other; + return Objects.equals(min, that.min) && Objects.equals(max, that.max); + } + + @Override + public int hashCode() { + return Objects.hash(min, max); + } + + @Override + public String toString() { + return "BoundingBox{min=" + min.simpleString() + ", max=" + max.simpleString() + '}'; + } + + @Override + public int compareTo(BoundingBox other) { + int minComparison = min.compareTo(other.min); + if (minComparison != 0) { + return minComparison; + } + + return max.compareTo(other.max); + } +} diff --git a/api/src/main/java/org/apache/iceberg/geospatial/GeospatialBound.java b/api/src/main/java/org/apache/iceberg/geospatial/GeospatialBound.java new file mode 100644 index 000000000000..0d0a853f277e --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/geospatial/GeospatialBound.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.geospatial; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Comparator; +import java.util.Objects; + +/** + * Represents a geospatial bound (minimum or maximum) for Iceberg tables. + * + *

<p>According to the Bound + * serialization section of Iceberg Table spec, geospatial bounds are serialized differently + * from the regular WKB representation. Geometry and geography bounds are single point encoded as a + * concatenation of 8-byte little-endian IEEE 754 coordinate values in the order X, Y, Z (optional), + * M (optional). + * + *
<p>The encoding varies based on which coordinates are present: + * + *
<ul> + *
<li>x:y (2 doubles) when both z and m are unset + *
<li>x:y:z (3 doubles) when only m is unset + *
<li>x:y:NaN:m (4 doubles) when only z is unset + *
<li>x:y:z:m (4 doubles) when all coordinates are set + *
</ul> + * + *
<p>
This class represents a lower or upper geospatial bound and handles serialization and + * deserialization of these bounds to/from byte arrays, conforming to the Iceberg specification. + */ +public class GeospatialBound implements Serializable, Comparable { + /** + * Parses a geospatial bound from a byte buffer according to Iceberg spec. + * + *

<p>Based on the buffer size, this method determines which coordinates are present: - 16 bytes + * (2 doubles): x and y only - 24 bytes (3 doubles): x, y, and z - 32 bytes (4 doubles): x, y, z + * (might be NaN), and m + * + *
<p>
The ordinates are encoded as 8-byte little-endian IEEE 754 values. + * + * @param buffer the ByteBuffer containing the serialized geospatial bound + * @return a GeospatialBound object representing the parsed bound + * @throws IllegalArgumentException if the buffer has an invalid size + */ + public static GeospatialBound fromByteBuffer(ByteBuffer buffer) { + // Save original position and byte order to restore them later + int originalPosition = buffer.position(); + ByteOrder originalOrder = buffer.order(); + + try { + buffer.order(ByteOrder.LITTLE_ENDIAN); + int size = buffer.remaining(); + + if (size == 2 * Double.BYTES) { + // x:y format (2 doubles) + double coordX = buffer.getDouble(); + double coordY = buffer.getDouble(); + return createXY(coordX, coordY); + } else if (size == 3 * Double.BYTES) { + // x:y:z format (3 doubles) + double coordX = buffer.getDouble(); + double coordY = buffer.getDouble(); + double coordZ = buffer.getDouble(); + return createXYZ(coordX, coordY, coordZ); + } else if (size == 4 * Double.BYTES) { + // x:y:z:m format (4 doubles) - z might be NaN + double coordX = buffer.getDouble(); + double coordY = buffer.getDouble(); + double coordZ = buffer.getDouble(); + double coordM = buffer.getDouble(); + return new GeospatialBound(coordX, coordY, coordZ, coordM); + } else { + throw new IllegalArgumentException( + "Invalid buffer size for GeospatialBound: expected 16, 24, or 32 bytes, got " + size); + } + } finally { + // Restore original position and byte order + buffer.position(originalPosition); + buffer.order(originalOrder); + } + } + + /** + * Parses a geospatial bound from a byte array according to Iceberg spec. 
+ * + * @param bytes the byte array containing the serialized geospatial bound + * @return a GeospatialBound object representing the parsed bound + * @throws IllegalArgumentException if the byte array has an invalid length + */ + public static GeospatialBound fromByteArray(byte[] bytes) { + int length = bytes.length; + if (length != 2 * Double.BYTES && length != 3 * Double.BYTES && length != 4 * Double.BYTES) { + throw new IllegalArgumentException( + "Invalid byte array length for GeospatialBound: expected 16, 24, or 32 bytes, got " + + length); + } + + return fromByteBuffer(ByteBuffer.wrap(bytes)); + } + + /** + * Creates a GeospatialBound with X and Y coordinates only. + * + * @param x the X coordinate (longitude/easting) + * @param y the Y coordinate (latitude/northing) + * @return a GeospatialBound with XY coordinates + */ + @SuppressWarnings("ParameterName") + public static GeospatialBound createXY(double x, double y) { + return new GeospatialBound(x, y, Double.NaN, Double.NaN); + } + + /** + * Creates a GeospatialBound with X, Y, and Z coordinates, with no M value. + * + * @param x the X coordinate (longitude/easting) + * @param y the Y coordinate (latitude/northing) + * @param z the Z coordinate (elevation) + * @return a GeospatialBound with XYZ coordinates + */ + @SuppressWarnings("ParameterName") + public static GeospatialBound createXYZ(double x, double y, double z) { + return new GeospatialBound(x, y, z, Double.NaN); + } + + /** + * Creates a GeospatialBound with X, Y, Z, and M coordinates. 
+ * + * @param x the X coordinate (longitude/easting) + * @param y the Y coordinate (latitude/northing) + * @param z the Z coordinate (elevation) + * @param m the M value (measure) + * @return a GeospatialBound with XYZM coordinates + */ + @SuppressWarnings("ParameterName") + public static GeospatialBound createXYZM(double x, double y, double z, double m) { + return new GeospatialBound(x, y, z, m); + } + + /** + * Creates a GeospatialBound with X, Y, and M values, with no Z coordinate. + * + * @param x the X coordinate (longitude/easting) + * @param y the Y coordinate (latitude/northing) + * @param m the M value (measure) + * @return a GeospatialBound with XYM coordinates + */ + @SuppressWarnings("ParameterName") + public static GeospatialBound createXYM(double x, double y, double m) { + return new GeospatialBound(x, y, Double.NaN, m); + } + + @SuppressWarnings("MemberName") + private final double x; + + @SuppressWarnings("MemberName") + private final double y; + + @SuppressWarnings("MemberName") + private final double z; + + @SuppressWarnings("MemberName") + private final double m; + + /** Private constructor - use factory methods instead. */ + @SuppressWarnings("ParameterName") + private GeospatialBound(double x, double y, double z, double m) { + this.x = x; + this.y = y; + this.z = z; + this.m = m; + } + + /** + * Get the X coordinate (longitude/easting). + * + * @return X coordinate value + */ + @SuppressWarnings("MethodName") + public double x() { + return x; + } + + /** + * Get the Y coordinate (latitude/northing). + * + * @return Y coordinate value + */ + @SuppressWarnings("MethodName") + public double y() { + return y; + } + + /** + * Get the Z coordinate (typically elevation). + * + * @return Z coordinate value or NaN if not set + */ + @SuppressWarnings("MethodName") + public double z() { + return z; + } + + /** + * Get the M value (measure). 
+ * + * @return M value or NaN if not set + */ + @SuppressWarnings("MethodName") + public double m() { + return m; + } + + /** + * Check if this bound has a defined Z coordinate. + * + * @return true if Z is not NaN + */ + public boolean hasZ() { + return !Double.isNaN(z); + } + + /** + * Check if this bound has a defined M value. + * + * @return true if M is not NaN + */ + public boolean hasM() { + return !Double.isNaN(m); + } + + /** + * Serializes this geospatial bound to a byte buffer according to Iceberg spec. + * + *

Following the Iceberg spec, the bound is serialized based on which coordinates are set: - + * x:y (2 doubles) when both z and m are unset - x:y:z (3 doubles) when only m is unset - + * x:y:NaN:m (4 doubles) when only z is unset - x:y:z:m (4 doubles) when all coordinates are set + * + * @return A ByteBuffer containing the serialized geospatial bound + */ + public ByteBuffer toByteBuffer() { + // Calculate size based on which coordinates are present + int size; + if (!hasZ() && !hasM()) { + // Just x and y + size = 2 * Double.BYTES; + } else if (hasZ() && !hasM()) { + // x, y, and z (no m) + size = 3 * Double.BYTES; + } else { + // x, y, z (or NaN), and m + size = 4 * Double.BYTES; + } + + ByteBuffer buffer = ByteBuffer.allocate(size).order(ByteOrder.LITTLE_ENDIAN); + buffer.putDouble(x); + buffer.putDouble(y); + + if (hasZ() || hasM()) { + // If we have z or m or both, we need to include z (could be NaN) + buffer.putDouble(z); + } + + if (hasM()) { + // If we have m, include it + buffer.putDouble(m); + } + + buffer.flip(); + return buffer; + } + + @Override + public String toString() { + return "GeospatialBound(" + simpleString() + ")"; + } + + public String simpleString() { + StringBuilder sb = new StringBuilder(); + sb.append("x=").append(x).append(", y=").append(y); + + if (hasZ()) { + sb.append(", z=").append(z); + } + + if (hasM()) { + sb.append(", m=").append(m); + } + + return sb.toString(); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } else if (!(other instanceof GeospatialBound)) { + return false; + } + + GeospatialBound that = (GeospatialBound) other; + return Double.compare(that.x, x) == 0 + && Double.compare(that.y, y) == 0 + && Double.compare(that.z, z) == 0 + && Double.compare(that.m, m) == 0; + } + + @Override + public int hashCode() { + return Objects.hash(GeospatialBound.class, x, y, z, m); + } + + @Override + public int compareTo(GeospatialBound other) { + return 
Comparator.comparingDouble(GeospatialBound::x) + .thenComparingDouble(GeospatialBound::y) + .thenComparingDouble(GeospatialBound::z) + .thenComparingDouble(GeospatialBound::m) + .compare(this, other); + } +} diff --git a/api/src/main/java/org/apache/iceberg/geospatial/GeospatialPredicateEvaluators.java b/api/src/main/java/org/apache/iceberg/geospatial/GeospatialPredicateEvaluators.java new file mode 100644 index 000000000000..64943902445c --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/geospatial/GeospatialPredicateEvaluators.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.geospatial; + +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Type; + +public class GeospatialPredicateEvaluators { + private GeospatialPredicateEvaluators() {} + + public interface GeospatialPredicateEvaluator { + /** + * Test whether this bounding box intersects with another. 
+ * + * @param bbox1 the first bounding box + * @param bbox2 the second bounding box + * @return true if this box intersects the other box + */ + boolean intersects(BoundingBox bbox1, BoundingBox bbox2); + } + + public static GeospatialPredicateEvaluator create(Type type) { + switch (type.typeId()) { + case GEOMETRY: + return new GeometryEvaluator(); + case GEOGRAPHY: + return new GeographyEvaluator(); + default: + throw new UnsupportedOperationException("Unsupported type for BoundingBox: " + type); + } + } + + static class GeometryEvaluator implements GeospatialPredicateEvaluator { + @Override + public boolean intersects(BoundingBox bbox1, BoundingBox bbox2) { + return intersectsWithWrapAround(bbox1, bbox2); + } + + /** + * Check if two bounding boxes intersect, taking wrap-around into account. + * + *

Wraparound (or antimeridian crossing) occurs when a geography crosses the 180°/-180° + * longitude line on a map. In these cases, the minimum X value is greater than the maximum X + * value (xmin > xmax). This represents a bounding box that wraps around the globe. + * + *

For example, a bounding box with xmin=170° and xmax=-170° represents an area that spans + * from 170° east to 190° east (or equivalently, -170° west). This is important for geometries + * that cross the antimeridian, like a path from Japan to Alaska. + * + *

When xmin > xmax, a point matches if its X coordinate is either X ≥ xmin OR X ≤ xmax, + * rather than the usual X ≥ xmin AND X ≤ xmax. In geographic terms, if the westernmost + * longitude is greater than the easternmost longitude, this indicates an antimeridian crossing. + * + *

The Iceberg specification does not explicitly rule out the use of wrap-around in bounding + * boxes for geometry types, so we handle wrap-around for both geography and geometry bounding + * boxes. + * + * @param bbox1 the first bounding box + * @param bbox2 the second bounding box + * @return true if the bounding boxes intersect + */ + static boolean intersectsWithWrapAround(BoundingBox bbox1, BoundingBox bbox2) { + // Let's check y first, and if y does not intersect, we can return false + if (bbox1.min().y() > bbox2.max().y() || bbox1.max().y() < bbox2.min().y()) { + return false; + } + + // Now check x, need to take wrap-around into account + if (bbox1.min().x() <= bbox1.max().x() && bbox2.min().x() <= bbox2.max().x()) { + // No wrap-around + return bbox1.min().x() <= bbox2.max().x() && bbox1.max().x() >= bbox2.min().x(); + } else if (bbox1.min().x() > bbox1.max().x() && bbox2.min().x() <= bbox2.max().x()) { + // bbox1 wraps around the antimeridian, bbox2 does not + return bbox1.min().x() <= bbox2.max().x() || bbox1.max().x() >= bbox2.min().x(); + } else if (bbox1.min().x() <= bbox1.max().x() && bbox2.min().x() > bbox2.max().x()) { + // bbox2 wraps around the antimeridian, bbox1 does not + return intersectsWithWrapAround(bbox2, bbox1); + } else { + // Both wrap around the antimeridian, they must intersect + return true; + } + } + } + + static class GeographyEvaluator implements GeospatialPredicateEvaluator { + @Override + public boolean intersects(BoundingBox bbox1, BoundingBox bbox2) { + validateBoundingBox(bbox1); + validateBoundingBox(bbox2); + return GeometryEvaluator.intersectsWithWrapAround(bbox1, bbox2); + } + + /** + * For geography types, coordinates are restricted to the canonical ranges of [-180°, 180°] for + * longitude (X) and [-90°, 90°] for latitude (Y). 
+ * + * @param bbox the bounding box to validate + * @throws IllegalArgumentException if the bounding box is invalid + */ + private void validateBoundingBox(BoundingBox bbox) { + Preconditions.checkArgument( + bbox.min().y() >= -90 && bbox.max().y() <= 90, "Latitude out of range: %s", bbox); + Preconditions.checkArgument( + bbox.min().x() >= -180 + && bbox.min().x() <= 180 + && bbox.max().x() >= -180 + && bbox.max().x() <= 180, + "Longitude out of range: %s", + bbox); + } + } +} diff --git a/api/src/main/java/org/apache/iceberg/types/Conversions.java b/api/src/main/java/org/apache/iceberg/types/Conversions.java index e18c7b4362e6..54074756a1b1 100644 --- a/api/src/main/java/org/apache/iceberg/types/Conversions.java +++ b/api/src/main/java/org/apache/iceberg/types/Conversions.java @@ -117,6 +117,19 @@ public static ByteBuffer toByteBuffer(Type.TypeID typeId, Object value) { return (ByteBuffer) value; case DECIMAL: return ByteBuffer.wrap(((BigDecimal) value).unscaledValue().toByteArray()); + case GEOMETRY: + case GEOGRAPHY: + // There are 2 representations of geometry and geography in iceberg: + // + // 1. Well-known binary (WKB) format for general storage and processing + // 2. For bound values (partition and sort keys), points are encoded as little-endian + // doubles: + // X (longitude/easting), Y (latitude/northing), Z (optional elevation), and M (optional + // measure) + // + // No matter what representation is used, geospatial values are always represented as byte + // buffers, so we can just return the value as is. 
+ return (ByteBuffer) value; default: throw new UnsupportedOperationException("Cannot serialize type: " + typeId); } @@ -177,6 +190,11 @@ private static Object internalFromByteBuffer(Type type, ByteBuffer buffer) { byte[] unscaledBytes = new byte[buffer.remaining()]; tmp.get(unscaledBytes); return new BigDecimal(new BigInteger(unscaledBytes), decimal.scale()); + case GEOMETRY: + case GEOGRAPHY: + // GEOMETRY and GEOGRAPHY values are represented as byte buffers. Please refer to the + // comment in toByteBuffer for more details. + return tmp; default: throw new UnsupportedOperationException("Cannot deserialize type: " + type); } diff --git a/api/src/main/java/org/apache/iceberg/types/Types.java b/api/src/main/java/org/apache/iceberg/types/Types.java index 1c16c444d4e6..ec6076b04fa0 100644 --- a/api/src/main/java/org/apache/iceberg/types/Types.java +++ b/api/src/main/java/org/apache/iceberg/types/Types.java @@ -81,7 +81,6 @@ public static Type fromTypeName(String typeString) { Matcher geometry = GEOMETRY_PARAMETERS.matcher(typeString); if (geometry.matches()) { String crs = geometry.group(1); - Preconditions.checkArgument(!crs.contains(","), "Invalid CRS: %s", crs); return GeometryType.of(crs); } @@ -599,7 +598,7 @@ public TypeID typeId() { } public String crs() { - return crs; + return crs != null ? crs : DEFAULT_CRS; } @Override @@ -631,6 +630,7 @@ public String toString() { public static class GeographyType extends PrimitiveType { public static final String DEFAULT_CRS = "OGC:CRS84"; + public static final EdgeAlgorithm DEFAULT_ALGORITHM = EdgeAlgorithm.SPHERICAL; public static GeographyType crs84() { return new GeographyType(); @@ -664,11 +664,11 @@ public TypeID typeId() { } public String crs() { - return crs; + return crs != null ? crs : DEFAULT_CRS; } public EdgeAlgorithm algorithm() { - return algorithm; + return algorithm != null ? 
algorithm : DEFAULT_ALGORITHM; } @Override diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestBoundGeospatialPredicate.java b/api/src/test/java/org/apache/iceberg/expressions/TestBoundGeospatialPredicate.java new file mode 100644 index 000000000000..4b605bbfe365 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/expressions/TestBoundGeospatialPredicate.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.expressions; + +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.ByteBuffer; +import java.util.stream.Stream; +import org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestBoundGeospatialPredicate { + private static final Schema SCHEMA = + new Schema( + required(1, "point", Types.GeometryType.crs84()), + required(2, "geography", Types.GeographyType.crs84()), + required(3, "point2", Types.GeometryType.crs84()), + required(4, "geography2", Types.GeographyType.crs84())); + + private static Stream geospatialOperators() { + return Stream.of( + Arguments.of(Expression.Operation.ST_INTERSECTS, "point", 1), + Arguments.of(Expression.Operation.ST_DISJOINT, "geography", 2)); + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testGeospatialPredicateBinding( + Expression.Operation op, String fieldName, int fieldId) { + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Create an unbound predicate based on the operation + UnboundPredicate unbound = Expressions.geospatialPredicate(op, fieldName, bbox); + + // Bind the predicate to the schema + Expression bound = unbound.bind(SCHEMA.asStruct()); + + // Verify the bound predicate is a BoundGeospatialPredicate + assertThat(bound).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + + // Verify the operation matches the expected operation + 
assertThat(predicate.op()).isEqualTo(op); + + // Verify the term references the correct field + assertThat(predicate.term().ref().fieldId()).isEqualTo(fieldId); + + // Verify the literal value is correct + assertThat(predicate.literal().value()).isEqualTo(bbox); + + // Verify the predicate is identified as a geospatial predicate + assertThat(predicate.isGeospatialPredicate()).isTrue(); + + // Only check asGeospatialPredicate for ST_INTERSECTS to maintain original test behavior + if (op == Expression.Operation.ST_INTERSECTS) { + assertThat(predicate.asGeospatialPredicate()).isSameAs(predicate); + } + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testNegation(Expression.Operation op, String fieldName, int fieldId) { + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Create an unbound predicate based on the operation + UnboundPredicate unbound = Expressions.geospatialPredicate(op, fieldName, bbox); + + // Bind the predicate to the schema + Expression bound = unbound.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + + // Negate the predicate + Expression negated = predicate.negate(); + + // Verify the negated predicate is a BoundGeospatialPredicate + assertThat(negated).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate negatedPredicate = (BoundGeospatialPredicate) negated; + + // Verify the operation is the opposite of the original + Expression.Operation expectedNegatedOp = + (op == Expression.Operation.ST_INTERSECTS) + ? 
Expression.Operation.ST_DISJOINT + : Expression.Operation.ST_INTERSECTS; + assertThat(negatedPredicate.op()).isEqualTo(expectedNegatedOp); + + // Verify the term and literal are unchanged + assertThat(negatedPredicate.term()).isEqualTo(predicate.term()); + assertThat(negatedPredicate.literal().value()).isEqualTo(predicate.literal().value()); + + // Test double negation + Expression doubleNegated = negatedPredicate.negate(); + assertThat(doubleNegated).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate doubleNegatedPredicate = (BoundGeospatialPredicate) doubleNegated; + + // Verify the operation is back to the original + assertThat(doubleNegatedPredicate.op()).isEqualTo(op); + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testEquivalence(Expression.Operation op, String fieldName, int fieldId) { + // Create two identical bounding boxes + GeospatialBound min1 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max1 = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox1 = new BoundingBox(min1, max1); + GeospatialBound min2 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox2 = new BoundingBox(min2, max2); + + // Create a different bounding box + GeospatialBound min3 = GeospatialBound.createXY(5.0, 6.0); + GeospatialBound max3 = GeospatialBound.createXY(7.0, 8.0); + BoundingBox bbox3 = new BoundingBox(min3, max3); + + // Create the main predicate with the current operation + UnboundPredicate unbound1 = Expressions.geospatialPredicate(op, fieldName, bbox1); + Expression bound1 = unbound1.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate1 = (BoundGeospatialPredicate) bound1; + + // Create a predicate with the same operation and same bounding box + UnboundPredicate unbound2 = Expressions.geospatialPredicate(op, fieldName, bbox2); + Expression bound2 = unbound2.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate2 = 
(BoundGeospatialPredicate) bound2; + + // Create a predicate with the same operation but different bounding box + UnboundPredicate unbound3 = Expressions.geospatialPredicate(op, fieldName, bbox3); + Expression bound3 = unbound3.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate3 = (BoundGeospatialPredicate) bound3; + + // Create a predicate with the opposite operation and same bounding box + UnboundPredicate unbound4 = + Expressions.geospatialPredicate(op.negate(), fieldName, bbox1); + Expression bound4 = unbound4.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate4 = (BoundGeospatialPredicate) bound4; + + // Create a predicate with the same operation and the same bounding box, but different field + // name + UnboundPredicate unbound5 = Expressions.geospatialPredicate(op, "point2", bbox1); + Expression bound5 = unbound5.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate5 = (BoundGeospatialPredicate) bound5; + + UnboundPredicate unbound6 = + Expressions.geospatialPredicate(op, "geography2", bbox1); + Expression bound6 = unbound6.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate6 = (BoundGeospatialPredicate) bound6; + + // Test equivalence + assertThat(predicate1.isEquivalentTo(predicate2)).isTrue(); + assertThat(predicate2.isEquivalentTo(predicate1)).isTrue(); + + // Different bounding box + assertThat(predicate1.isEquivalentTo(predicate3)).isFalse(); + + // Different operation + assertThat(predicate1.isEquivalentTo(predicate4)).isFalse(); + + // Different field name + assertThat(predicate1.isEquivalentTo(predicate5)).isFalse(); + assertThat(predicate1.isEquivalentTo(predicate6)).isFalse(); + + // Not a geospatial predicate + assertThat(predicate1.isEquivalentTo(Expressions.alwaysTrue())).isFalse(); + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testToString(Expression.Operation op, String fieldName, int fieldId) { + // Create a bounding box for testing + GeospatialBound min = 
GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Create an unbound predicate based on the operation + UnboundPredicate unbound = Expressions.geospatialPredicate(op, fieldName, bbox); + + // Bind the predicate to the schema + Expression bound = unbound.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + + // Verify toString output contains the operation name + String expectedOpString = + (op == Expression.Operation.ST_INTERSECTS) ? "stIntersects" : "stDisjoint"; + assertThat(predicate.toString()).contains(expectedOpString); + + // Verify toString output contains the field ID + assertThat(predicate.toString()).contains("id=" + fieldId); + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testWithComplexBoundingBox(Expression.Operation op, String fieldName, int fieldId) { + // Create a bounding box with Z and M coordinates + GeospatialBound min = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + GeospatialBound max = GeospatialBound.createXYZM(5.0, 6.0, 7.0, 8.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Create an unbound predicate based on the operation + UnboundPredicate unbound = Expressions.geospatialPredicate(op, fieldName, bbox); + + // Bind the predicate to the schema + Expression bound = unbound.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + + // Verify the operation matches the expected operation + assertThat(predicate.op()).isEqualTo(op); + + // Verify the term references the correct field + assertThat(predicate.term().ref().fieldId()).isEqualTo(fieldId); + + // Verify the literal value is correct + assertThat(predicate.literal().value()).isEqualTo(bbox); + + // Verify Z and M coordinates are preserved + BoundingBox boundingBox = predicate.literal().value(); + assertThat(boundingBox.min().hasZ()).isTrue(); + 
assertThat(boundingBox.min().hasM()).isTrue(); + assertThat(boundingBox.min().z()).isEqualTo(3.0); + assertThat(boundingBox.min().m()).isEqualTo(4.0); + assertThat(boundingBox.max().hasZ()).isTrue(); + assertThat(boundingBox.max().hasM()).isTrue(); + assertThat(boundingBox.max().z()).isEqualTo(7.0); + assertThat(boundingBox.max().m()).isEqualTo(8.0); + } + + @ParameterizedTest + @MethodSource("geospatialOperators") + public void testWithSpecialValues(Expression.Operation op, String fieldName, int fieldId) { + // Create a bounding box with NaN and infinity values + GeospatialBound min = GeospatialBound.createXY(Double.NEGATIVE_INFINITY, Double.NaN); + GeospatialBound max = GeospatialBound.createXY(Double.POSITIVE_INFINITY, Double.NaN); + BoundingBox bbox = new BoundingBox(min, max); + + // Create an unbound predicate based on the operation + UnboundPredicate unbound = Expressions.geospatialPredicate(op, fieldName, bbox); + + // Bind the predicate to the schema + Expression bound = unbound.bind(SCHEMA.asStruct()); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + + // Verify the operation matches the expected operation + assertThat(predicate.op()).isEqualTo(op); + + // Verify the term references the correct field + assertThat(predicate.term().ref().fieldId()).isEqualTo(fieldId); + + // Verify the literal value is correct + assertThat(predicate.literal().value()).isEqualTo(bbox); + + // Verify special values are preserved + BoundingBox boundingBox = predicate.literal().value(); + assertThat(boundingBox.min().x()).isEqualTo(Double.NEGATIVE_INFINITY); + assertThat(Double.isNaN(boundingBox.min().y())).isTrue(); + assertThat(boundingBox.max().x()).isEqualTo(Double.POSITIVE_INFINITY); + assertThat(Double.isNaN(boundingBox.max().y())).isTrue(); + } +} diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java index 995dde539f8c..1233e9ab6e46 100644 --- 
a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java @@ -43,15 +43,22 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.stream.Stream; import org.apache.avro.util.Utf8; import org.apache.iceberg.TestHelpers; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.StructType; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; public class TestEvaluator { private static final StructType STRUCT = @@ -809,4 +816,33 @@ public void testNotInExceptions() { .isInstanceOf(ValidationException.class) .hasMessageContaining("Invalid value for conversion to type int"); } + + private static Stream geospatialPredicateParameters() { + return Stream.of( + Arguments.of(Expression.Operation.ST_INTERSECTS, "geom"), + Arguments.of(Expression.Operation.ST_INTERSECTS, "geog"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geom"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geog")); + } + + @ParameterizedTest + @MethodSource("geospatialPredicateParameters") + public void testGeospatialPredicates(Expression.Operation operation, String columnName) { + StructType geoStruct = + StructType.of( + required(1, "geom", Types.GeometryType.crs84()), + required(2, "geog", Types.GeographyType.crs84())); + + BoundingBox bbox = + new BoundingBox(GeospatialBound.createXY(1.0, 2.0), GeospatialBound.createXY(3.0, 4.0)); + + // Create a WKB point at (2, 3) + ByteBuffer wkb = + 
ByteBuffer.wrap( + new byte[] {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 8, 64}); + + Evaluator evaluator = + new Evaluator(geoStruct, Expressions.geospatialPredicate(operation, columnName, bbox)); + assertThat(evaluator.eval(TestHelpers.Row.of(wkb, wkb))).isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java index 5293681f6f5d..8056e445aea3 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java @@ -40,6 +40,8 @@ import org.apache.iceberg.TestHelpers; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.StructType; import org.junit.jupiter.api.Test; @@ -55,7 +57,9 @@ public class TestExpressionBinding { required(3, "data", Types.StringType.get()), required(4, "var", Types.VariantType.get()), optional(5, "nullable", Types.IntegerType.get()), - optional(6, "always_null", Types.UnknownType.get())); + optional(6, "always_null", Types.UnknownType.get()), + required(7, "point", Types.GeometryType.crs84()), + required(8, "geography", Types.GeographyType.crs84())); @Test public void testMissingReference() { @@ -421,4 +425,66 @@ public void testExtractBindingWithTypes(String typeName) { assertThat(pred.term()).as("Should use a BoundExtract").isInstanceOf(BoundExtract.class); assertThat(pred.term().type()).isEqualTo(Types.fromPrimitiveString(typeName)); } + + @Test + public void testStIntersects() { + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + Expression expr = + 
Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "point", bbox); + Expression bound = Binder.bind(STRUCT, expr); + + TestHelpers.assertAllReferencesBound("BoundGeospatialPredicate", bound); + BoundPredicate pred = TestHelpers.assertAndUnwrap(bound); + assertThat(pred.op()).isEqualTo(Expression.Operation.ST_INTERSECTS); + assertThat(pred.term().ref().fieldId()).as("Should bind point correctly").isEqualTo(7); + assertThat(bound).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + assertThat(predicate.literal().value()).isEqualTo(bbox); + } + + @Test + public void testStDisjoint() { + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + Expression expr = + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geography", bbox); + Expression bound = Binder.bind(STRUCT, expr); + + TestHelpers.assertAllReferencesBound("BoundGeospatialPredicate", bound); + BoundPredicate pred = TestHelpers.assertAndUnwrap(bound); + assertThat(pred.op()).isEqualTo(Expression.Operation.ST_DISJOINT); + assertThat(pred.term().ref().fieldId()).as("Should bind geography correctly").isEqualTo(8); + assertThat(bound).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate predicate = (BoundGeospatialPredicate) bound; + assertThat(predicate.literal().value()).isEqualTo(bbox); + } + + @Test + public void testGeospatialPredicateWithInvalidField() { + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Test with a field that doesn't exist + Expression expr = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "nonexistent", bbox); + assertThatThrownBy(() -> 
Binder.bind(STRUCT, expr)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Cannot find field 'nonexistent' in struct"); + + // Test with a field that is not a geometry or geography type + Expression expr2 = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "x", bbox); + assertThatThrownBy(() -> Binder.bind(STRUCT, expr2)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Cannot bind geospatial operation to non-geospatial type"); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java index 2a1fab10a445..09c643afd164 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java @@ -50,6 +50,8 @@ import java.util.List; import java.util.concurrent.Callable; import org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.NestedField; @@ -273,6 +275,73 @@ public void testInvalidateNaNInput() { assertInvalidateNaNThrows(() -> predicate(Expression.Operation.EQ, "a", Double.NaN)); } + @Test + public void testRewriteNotForGeospatialPredicates() { + // Create a schema with geometry and geography fields + StructType struct = + StructType.of( + NestedField.optional(1, "geom", Types.GeometryType.crs84()), + NestedField.optional(2, "geog", Types.GeographyType.crs84())); + + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Test pairs of expressions: (rewritten pred, original pred) + Expression[][] expressions = + new Expression[][] { + // 
ST_INTERSECTS and its negation (ST_DISJOINT) + { + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geom", bbox), + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geom", bbox) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geom", bbox), + not(Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geom", bbox)) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geom", bbox), + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geom", bbox) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geom", bbox), + not(Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geom", bbox)) + }, + // Same tests with geography type + { + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geog", bbox), + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geog", bbox) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geog", bbox), + not(Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geog", bbox)) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geog", bbox), + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geog", bbox) + }, + { + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geog", bbox), + not(Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geog", bbox)) + } + }; + + for (Expression[] pair : expressions) { + // unbound rewrite + assertThat(rewriteNot(pair[1])) + .as(String.format("rewriteNot(%s) should be %s", pair[1], pair[0])) + .hasToString(pair[0].toString()); + + // bound rewrite + Expression expectedBound = Binder.bind(struct, pair[0]); + Expression toRewriteBound = Binder.bind(struct, pair[1]); + assertThat(rewriteNot(toRewriteBound)) + .as(String.format("rewriteNot(%s) should be %s", toRewriteBound, expectedBound)) + 
.hasToString(expectedBound.toString()); + } + } + private void assertInvalidateNaNThrows(Callable> callable) { assertThatThrownBy(callable::call) .isInstanceOf(IllegalArgumentException.class) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java index fc7ddd035bf2..3a6f4be71ba8 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java @@ -24,6 +24,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers; import org.apache.iceberg.expressions.Expression.Operation; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -33,7 +35,9 @@ public void testExpressions() throws Exception { Schema schema = new Schema( Types.NestedField.optional(34, "a", Types.IntegerType.get()), - Types.NestedField.required(35, "s", Types.StringType.get())); + Types.NestedField.required(35, "s", Types.StringType.get()), + Types.NestedField.required(36, "point", Types.GeometryType.crs84()), + Types.NestedField.required(37, "geography", Types.GeographyType.crs84())); Expression[] expressions = new Expression[] { @@ -61,7 +65,26 @@ public void testExpressions() throws Exception { Expressions.notIn("s", "abc", "xyz").bind(schema.asStruct()), Expressions.isNull("a").bind(schema.asStruct()), Expressions.startsWith("s", "abc").bind(schema.asStruct()), - Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()) + Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()), + Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()), + Expressions.stIntersects( + "point", + new BoundingBox( + GeospatialBound.createXY(1.0, 2.0), GeospatialBound.createXY(3.0, 4.0))), + Expressions.stDisjoint( + 
"geography", + new BoundingBox( + GeospatialBound.createXY(5.0, 6.0), GeospatialBound.createXY(7.0, 8.0))), + Expressions.stIntersects( + "point", + new BoundingBox( + GeospatialBound.createXY(1.0, 2.0), GeospatialBound.createXY(3.0, 4.0))) + .bind(schema.asStruct()), + Expressions.stDisjoint( + "geography", + new BoundingBox( + GeospatialBound.createXY(5.0, 6.0), GeospatialBound.createXY(7.0, 8.0))) + .bind(schema.asStruct()) }; for (Expression expression : expressions) { @@ -149,12 +172,20 @@ private static boolean equals(Predicate left, Predicate right) { if (left instanceof UnboundPredicate) { UnboundPredicate lpred = (UnboundPredicate) left; UnboundPredicate rpred = (UnboundPredicate) right; - if (left.op() == Operation.IN || left.op() == Operation.NOT_IN) { + if (left.op() == Operation.IN + || left.op() == Operation.NOT_IN + || left.op() == Operation.ST_INTERSECTS + || left.op() == Operation.ST_DISJOINT) { return equals(lpred.literals(), rpred.literals()); } return lpred.literal().comparator().compare(lpred.literal().value(), rpred.literal().value()) == 0; + } else if (left instanceof BoundGeospatialPredicate) { + BoundGeospatialPredicate lpred = (BoundGeospatialPredicate) left; + BoundGeospatialPredicate rpred = (BoundGeospatialPredicate) right; + return lpred.isEquivalentTo(rpred); + } else if (left instanceof BoundPredicate) { BoundPredicate lpred = (BoundPredicate) left; BoundPredicate rpred = (BoundPredicate) right; diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index ca08951b1f53..46a65a6d32e9 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -31,8 +31,11 @@ import java.util.Map; import java.util.regex.Pattern; import java.util.stream.IntStream; +import java.util.stream.Stream; import org.apache.iceberg.PartitionSpec; import 
org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; @@ -45,6 +48,9 @@ import org.apache.iceberg.variants.VariantTestUtil; import org.apache.iceberg.variants.VariantValue; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; public class TestExpressionUtil { private static final Schema SCHEMA = @@ -1305,6 +1311,48 @@ private static VariantArray createArrayWithNestedTypes() { return (VariantArray) VariantValue.from(metadata, variantBB); } + private static Stream geospatialPredicateParameters() { + return Stream.of( + Arguments.of(Expression.Operation.ST_INTERSECTS, "geom"), + Arguments.of(Expression.Operation.ST_INTERSECTS, "geog"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geom"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geog")); + } + + @ParameterizedTest + @MethodSource("geospatialPredicateParameters") + public void testSanitizeGeospatialPredicates(Expression.Operation operation, String columnName) { + // Create a schema with geometry and geography fields + Schema geoSchema = + new Schema( + Types.NestedField.required(1, "geom", Types.GeometryType.crs84()), + Types.NestedField.required(2, "geog", Types.GeographyType.crs84())); + Types.StructType geoStruct = geoSchema.asStruct(); + + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + UnboundPredicate geoPredicate = + Expressions.geospatialPredicate(operation, columnName, bbox); + Expression predicateSanitized = + 
Expressions.geospatialPredicate(operation, columnName, BoundingBox.empty()); + assertEquals(predicateSanitized, ExpressionUtil.sanitize(geoPredicate)); + assertEquals(predicateSanitized, ExpressionUtil.sanitize(geoStruct, geoPredicate, true)); + + String opString = operation.name(); + String expectedSanitizedString = columnName + " " + opString + " WITH (bounding-box)"; + + assertThat(ExpressionUtil.toSanitizedString(geoPredicate)) + .as("Sanitized string should be identical for geospatial predicates") + .isEqualTo(expectedSanitizedString); + + assertThat(ExpressionUtil.toSanitizedString(geoStruct, geoPredicate, true)) + .as("Sanitized string should be identical for geospatial predicates") + .isEqualTo(expectedSanitizedString); + } + private void assertEquals(Expression expected, Expression actual) { assertThat(expected).isInstanceOf(UnboundPredicate.class); assertEquals((UnboundPredicate) expected, (UnboundPredicate) actual); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 7069d891c38d..dec7b2cb71cb 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -34,6 +34,8 @@ import static org.apache.iceberg.expressions.Expressions.notNull; import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.stDisjoint; +import static org.apache.iceberg.expressions.Expressions.stIntersects; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -47,6 +49,8 @@ import org.apache.iceberg.TestHelpers.Row; import 
org.apache.iceberg.TestHelpers.TestDataFile; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; @@ -71,7 +75,11 @@ public class TestInclusiveMetricsEvaluator { optional(11, "all_nans_v1_stats", Types.FloatType.get()), optional(12, "nan_and_null_only", Types.DoubleType.get()), optional(13, "no_nan_stats", Types.DoubleType.get()), - optional(14, "some_empty", Types.StringType.get())); + optional(14, "some_empty", Types.StringType.get()), + optional(15, "geom", Types.GeometryType.crs84()), + optional(16, "all_nulls_geom", Types.GeometryType.crs84()), + optional(17, "geog", Types.GeographyType.crs84()), + optional(18, "all_nulls_geog", Types.GeographyType.crs84())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -187,6 +195,40 @@ public class TestInclusiveMetricsEvaluator { // upper bounds ImmutableMap.of(3, toByteBuffer(StringType.get(), "abcdefghi"))); + private static final DataFile FILE_6 = + new TestDataFile( + "file_6.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.builder() + .put(15, 20L) + .put(16, 20L) + .put(17, 20L) + .put(18, 20L) + .buildOrThrow(), + // null value counts + ImmutableMap.builder() + .put(15, 2L) + .put(16, 20L) + .put(17, 2L) + .put(18, 20L) + .buildOrThrow(), + // nan value counts + null, + // lower bounds + ImmutableMap.of( + 15, + GeospatialBound.createXY(1, 2).toByteBuffer(), + 17, + GeospatialBound.createXY(1, 2).toByteBuffer()), + // upper bounds + ImmutableMap.of( + 15, + GeospatialBound.createXY(10, 20).toByteBuffer(), + 17, + GeospatialBound.createXY(10, 20).toByteBuffer())); + @Test public void testAllNulls() { boolean shouldRead = new 
InclusiveMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -863,4 +905,62 @@ public void testIntegerNotIn() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("no_nulls", "abc", "def")).eval(FILE); assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue(); } + + @Test + public void testStIntersects() { + boolean shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + stIntersects( + "geom", + new BoundingBox( + GeospatialBound.createXY(0, 0), GeospatialBound.createXY(3, 4)))) + .eval(FILE_6); + assertThat(shouldRead).as("Should read: query window intersects the boundary").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + stIntersects( + "geom", + new BoundingBox( + GeospatialBound.createXY(0, 0), GeospatialBound.createXY(0.5, 2)))) + .eval(FILE_6); + assertThat(shouldRead) + .as("Should skip: query window does not intersect with the boundary") + .isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + stIntersects( + "geom", + new BoundingBox( + GeospatialBound.createXY(0, 0), GeospatialBound.createXY(0.5, 2)))) + .eval(FILE); + assertThat(shouldRead).as("Should read: stats is missing").isTrue(); + } + + @Test + public void testStDisjoint() { + boolean shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + stDisjoint( + "geom", + new BoundingBox( + GeospatialBound.createXY(0, 0), GeospatialBound.createXY(3, 4)))) + .eval(FILE_6); + assertThat(shouldRead).as("Should read: always read no matter if it's disjoint").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + stDisjoint( + "geom", + new BoundingBox( + GeospatialBound.createXY(0, 0), GeospatialBound.createXY(0.5, 2)))) + .eval(FILE); + assertThat(shouldRead).as("Should read: stats is missing").isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java b/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java index 24fc458b37b4..d82032b258b6 
100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java @@ -23,6 +23,8 @@ import java.math.BigDecimal; import java.util.UUID; import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -47,6 +49,9 @@ public void testLiterals() throws Exception { Literal.of(new byte[] {1, 2, 3}).to(Types.FixedType.ofLength(3)), Literal.of(new byte[] {3, 4, 5, 6}).to(Types.BinaryType.get()), Literal.of(new BigDecimal("122.50")), + Literal.of( + new BoundingBox( + GeospatialBound.createXY(1.0, 2.0), GeospatialBound.createXY(3.0, 4.0))) }; for (Literal lit : literals) { diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java b/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java index e2611ddb281f..cf6e66d0cbf9 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java @@ -25,6 +25,8 @@ import java.util.Arrays; import java.util.List; import java.util.UUID; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -302,7 +304,9 @@ public void testInvalidStringConversions() { Types.FloatType.get(), Types.DoubleType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get()); + Types.BinaryType.get(), + Types.GeometryType.crs84(), + Types.GeographyType.crs84()); } @Test @@ -344,7 +348,9 @@ public void testInvalidFixedConversions() { Types.DecimalType.of(9, 2), Types.StringType.get(), Types.UUIDType.get(), - Types.FixedType.ofLength(1)); + 
Types.FixedType.ofLength(1), + Types.GeometryType.crs84(), + Types.GeographyType.crs84()); } @Test @@ -365,7 +371,34 @@ public void testInvalidBinaryConversions() { Types.DecimalType.of(9, 2), Types.StringType.get(), Types.UUIDType.get(), - Types.FixedType.ofLength(1)); + Types.FixedType.ofLength(1), + Types.GeometryType.crs84(), + Types.GeographyType.crs84()); + } + + @Test + public void testInvalidGeospatialBoundingBoxConversions() { + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + Literal geoBoundingBoxLiteral = Literal.of(new BoundingBox(min, max)); + + // Test that geospatial bounding box literals cannot be converted to other types + testInvalidConversions( + geoBoundingBoxLiteral, + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.DecimalType.of(9, 2), + Types.StringType.get(), + Types.UUIDType.get(), + Types.BinaryType.get(), + Types.FixedType.ofLength(1), + Types.GeometryType.crs84(), + Types.GeographyType.crs84()); } private void testInvalidConversions(Literal lit, Type... 
invalidTypes) { diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java index a07c8fd1569d..769cc48563a0 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java @@ -41,10 +41,13 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.math.BigDecimal; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.StructType; import org.junit.jupiter.api.Test; @@ -648,4 +651,89 @@ public void testNotInPredicateBindingConversionToExpression() { .as("Should change NOT_IN to alwaysTrue expression") .isEqualTo(Expressions.alwaysTrue()); } + + @Test + public void testGeospatialPredicateBinding() { + StructType struct = + StructType.of( + required(20, "geom", Types.GeometryType.crs84()), + required(21, "geog", Types.GeographyType.crs84())); + + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + // Test ST_INTERSECTS with geometry + UnboundPredicate stIntersectsGeom = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geom", bbox); + Expression boundStIntersectsGeom = stIntersectsGeom.bind(struct); + assertThat(boundStIntersectsGeom).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate boundGeomPred = (BoundGeospatialPredicate) boundStIntersectsGeom; + assertThat(boundGeomPred.op()).isEqualTo(Expression.Operation.ST_INTERSECTS); + 
assertThat(boundGeomPred.term().ref().fieldId()).isEqualTo(20); + assertThat(boundGeomPred.literal().value()).isEqualTo(bbox); + + // Test ST_DISJOINT with geometry + UnboundPredicate stDisjointGeom = + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geom", bbox); + Expression boundStDisjointGeom = stDisjointGeom.bind(struct); + assertThat(boundStDisjointGeom).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate boundDisjointGeomPred = (BoundGeospatialPredicate) boundStDisjointGeom; + assertThat(boundDisjointGeomPred.op()).isEqualTo(Expression.Operation.ST_DISJOINT); + assertThat(boundDisjointGeomPred.term().ref().fieldId()).isEqualTo(20); + assertThat(boundDisjointGeomPred.literal().value()).isEqualTo(bbox); + + // Test ST_INTERSECTS with geography + UnboundPredicate stIntersectsGeog = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "geog", bbox); + Expression boundStIntersectsGeog = stIntersectsGeog.bind(struct); + assertThat(boundStIntersectsGeog).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate boundGeogPred = (BoundGeospatialPredicate) boundStIntersectsGeog; + assertThat(boundGeogPred.op()).isEqualTo(Expression.Operation.ST_INTERSECTS); + assertThat(boundGeogPred.term().ref().fieldId()).isEqualTo(21); + assertThat(boundGeogPred.literal().value()).isEqualTo(bbox); + + // Test ST_DISJOINT with geography + UnboundPredicate stDisjointGeog = + Expressions.geospatialPredicate(Expression.Operation.ST_DISJOINT, "geog", bbox); + Expression boundStDisjointGeog = stDisjointGeog.bind(struct); + assertThat(boundStDisjointGeog).isInstanceOf(BoundGeospatialPredicate.class); + BoundGeospatialPredicate boundDisjointGeogPred = (BoundGeospatialPredicate) boundStDisjointGeog; + assertThat(boundDisjointGeogPred.op()).isEqualTo(Expression.Operation.ST_DISJOINT); + assertThat(boundDisjointGeogPred.term().ref().fieldId()).isEqualTo(21); + 
assertThat(boundDisjointGeogPred.literal().value()).isEqualTo(bbox); + } + + @Test + public void testMissingFieldGeospatialPredicate() { + StructType struct = StructType.of(required(22, "x", Types.IntegerType.get())); + + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + UnboundPredicate unbound = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "missing", bbox); + assertThatThrownBy(() -> unbound.bind(struct)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Cannot find field 'missing' in struct:"); + } + + @Test + public void testInvalidTypeGeospatialPredicate() { + StructType struct = StructType.of(required(23, "x", Types.IntegerType.get())); + + // Create a bounding box for testing + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox bbox = new BoundingBox(min, max); + + UnboundPredicate unbound = + Expressions.geospatialPredicate(Expression.Operation.ST_INTERSECTS, "x", bbox); + assertThatThrownBy(() -> unbound.bind(struct)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Cannot bind geospatial operation to non-geospatial type:"); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..cfbd657ad5c9 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.geospatialPredicate; import static 
org.apache.iceberg.expressions.Expressions.greaterThan; import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; import static org.apache.iceberg.expressions.Expressions.in; @@ -39,16 +40,22 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.stream.Stream; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; import org.apache.iceberg.TestHelpers.TestDataFile; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.StringType; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; public class TestStrictMetricsEvaluator { private static final Schema SCHEMA = @@ -73,7 +80,9 @@ public class TestStrictMetricsEvaluator { Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + 17, "nested_col_with_stats", Types.IntegerType.get()))), + optional(18, "geom", Types.GeometryType.crs84()), + optional(19, "geog", Types.GeographyType.crs84())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -172,6 +181,32 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + private static final DataFile FILE_4 = + new TestDataFile( + "file_4.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of( + 1, 50L, + 
18, 50L, + 19, 50L), + // null value counts + ImmutableMap.of( + 1, 0L, + 18, 0L, + 19, 0L), + // nan value counts + null, + // lower bounds + ImmutableMap.of( + 18, GeospatialBound.createXY(1, 2).toByteBuffer(), + 19, GeospatialBound.createXY(1, 2).toByteBuffer()), + // upper bounds + ImmutableMap.of( + 18, GeospatialBound.createXY(10, 20).toByteBuffer(), + 19, GeospatialBound.createXY(10, 20).toByteBuffer())); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -684,4 +719,27 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + private static Stream geospatialPredicateParameters() { + return Stream.of( + Arguments.of(Expression.Operation.ST_INTERSECTS, "geom"), + Arguments.of(Expression.Operation.ST_INTERSECTS, "geog"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geom"), + Arguments.of(Expression.Operation.ST_DISJOINT, "geog")); + } + + @ParameterizedTest + @MethodSource("geospatialPredicateParameters") + public void testGeospatialPredicates(Expression.Operation operation, String columnName) { + boolean shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + geospatialPredicate( + operation, + columnName, + new BoundingBox( + GeospatialBound.createXY(1, 2), GeospatialBound.createXY(2, 3)))) + .eval(FILE_4); + assertThat(shouldRead).as("Geospatial predicate should never match").isFalse(); + } } diff --git a/api/src/test/java/org/apache/iceberg/geospatial/TestBoundingBox.java b/api/src/test/java/org/apache/iceberg/geospatial/TestBoundingBox.java new file mode 100644 index 000000000000..a0648ca85ee5 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/geospatial/TestBoundingBox.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.geospatial; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import org.junit.jupiter.api.Test; + +public class TestBoundingBox { + + @Test + public void testConstructorAndAccessors() { + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + + BoundingBox box = new BoundingBox(min, max); + + assertThat(box.min()).isEqualTo(min); + assertThat(box.max()).isEqualTo(max); + assertThat(box.min().x()).isEqualTo(1.0); + assertThat(box.min().y()).isEqualTo(2.0); + assertThat(box.max().x()).isEqualTo(3.0); + assertThat(box.max().y()).isEqualTo(4.0); + } + + @Test + public void testCreateFromByteBuffers() { + // Create byte buffers for XY bounds + ByteBuffer minBuffer = ByteBuffer.allocate(16); + minBuffer.order(ByteOrder.LITTLE_ENDIAN); + minBuffer.putDouble(0, 1.0); // x + minBuffer.putDouble(8, 2.0); // y + + ByteBuffer maxBuffer = ByteBuffer.allocate(16); + maxBuffer.order(ByteOrder.LITTLE_ENDIAN); + maxBuffer.putDouble(0, 3.0); // x + maxBuffer.putDouble(8, 4.0); // y + + BoundingBox box = BoundingBox.fromByteBuffers(minBuffer, maxBuffer); + + assertThat(box.min().x()).isEqualTo(1.0); + 
assertThat(box.min().y()).isEqualTo(2.0); + assertThat(box.max().x()).isEqualTo(3.0); + assertThat(box.max().y()).isEqualTo(4.0); + assertThat(minBuffer.order()).isEqualTo(ByteOrder.LITTLE_ENDIAN); + assertThat(maxBuffer.order()).isEqualTo(ByteOrder.LITTLE_ENDIAN); + } + + @Test + public void testCreateFromBigEndianByteBuffers() { + // Create byte buffers for XY bounds + ByteBuffer minBuffer = ByteBuffer.allocate(16); + minBuffer.order(ByteOrder.LITTLE_ENDIAN); + minBuffer.putDouble(0, 10.0); // x + minBuffer.putDouble(8, 20.0); // y + minBuffer.order(ByteOrder.BIG_ENDIAN); + + ByteBuffer maxBuffer = ByteBuffer.allocate(16); + maxBuffer.order(ByteOrder.LITTLE_ENDIAN); + maxBuffer.putDouble(0, 30.0); // x + maxBuffer.putDouble(8, 40.0); // y + maxBuffer.order(ByteOrder.BIG_ENDIAN); + + BoundingBox box = BoundingBox.fromByteBuffers(minBuffer, maxBuffer); + + assertThat(box.min().x()).isEqualTo(10.0); + assertThat(box.min().y()).isEqualTo(20.0); + assertThat(box.max().x()).isEqualTo(30.0); + assertThat(box.max().y()).isEqualTo(40.0); + assertThat(minBuffer.order()).isEqualTo(ByteOrder.BIG_ENDIAN); + assertThat(maxBuffer.order()).isEqualTo(ByteOrder.BIG_ENDIAN); + } + + @Test + public void testEqualsAndHashCode() { + GeospatialBound min1 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max1 = GeospatialBound.createXY(3.0, 4.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Same values + GeospatialBound min2 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXY(3.0, 4.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + // Different values + GeospatialBound min3 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max3 = GeospatialBound.createXY(10.0, 10.0); + BoundingBox box3 = new BoundingBox(min3, max3); + + // Test equals + assertThat(box1).isEqualTo(box2); + assertThat(box1).isNotEqualTo(box3); + assertThat(box1).isNotEqualTo(null); + assertThat(box1).isNotEqualTo("not a box"); + + // Test hashCode + 
assertThat(box1.hashCode()).isEqualTo(box2.hashCode()); + assertThat(box1.hashCode()).isNotEqualTo(box3.hashCode()); + } + + @Test + public void testToString() { + GeospatialBound min = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound max = GeospatialBound.createXY(3.0, 4.0); + BoundingBox box = new BoundingBox(min, max); + assertThat(box.toString()).isEqualTo("BoundingBox{min=x=1.0, y=2.0, max=x=3.0, y=4.0}"); + } +} diff --git a/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialBound.java b/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialBound.java new file mode 100644 index 000000000000..fa01bf2a0c9a --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialBound.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.geospatial; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.ByteBuffer; +import org.apache.iceberg.util.ByteBuffers; +import org.junit.jupiter.api.Test; + +public class TestGeospatialBound { + + @Test + public void testCreateXY() { + GeospatialBound bound = GeospatialBound.createXY(1.0, 2.0); + assertThat(bound.x()).isEqualTo(1.0); + assertThat(bound.y()).isEqualTo(2.0); + assertThat(bound.hasZ()).isFalse(); + assertThat(bound.hasM()).isFalse(); + assertThat(Double.isNaN(bound.z())).isTrue(); + assertThat(Double.isNaN(bound.m())).isTrue(); + } + + @Test + public void testCreateXYZ() { + GeospatialBound bound = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + assertThat(bound.x()).isEqualTo(1.0); + assertThat(bound.y()).isEqualTo(2.0); + assertThat(bound.z()).isEqualTo(3.0); + assertThat(bound.hasZ()).isTrue(); + assertThat(bound.hasM()).isFalse(); + assertThat(Double.isNaN(bound.m())).isTrue(); + } + + @Test + public void testCreateXYM() { + GeospatialBound bound = GeospatialBound.createXYM(1.0, 2.0, 4.0); + assertThat(bound.x()).isEqualTo(1.0); + assertThat(bound.y()).isEqualTo(2.0); + assertThat(bound.m()).isEqualTo(4.0); + assertThat(bound.hasZ()).isFalse(); + assertThat(bound.hasM()).isTrue(); + assertThat(Double.isNaN(bound.z())).isTrue(); + } + + @Test + public void testCreateXYZM() { + GeospatialBound bound = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + assertThat(bound.x()).isEqualTo(1.0); + assertThat(bound.y()).isEqualTo(2.0); + assertThat(bound.z()).isEqualTo(3.0); + assertThat(bound.m()).isEqualTo(4.0); + assertThat(bound.hasZ()).isTrue(); + assertThat(bound.hasM()).isTrue(); + } + + @Test + public void testEqualsAndHashCode() { + GeospatialBound xy1 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound xy2 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound xy3 = GeospatialBound.createXY(2.0, 1.0); + assertThat(xy1).isEqualTo(xy2); + 
assertThat(xy1.hashCode()).isEqualTo(xy2.hashCode()); + assertThat(xy1).isNotEqualTo(xy3); + + GeospatialBound xyz1 = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + GeospatialBound xyz2 = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + GeospatialBound xyz3 = GeospatialBound.createXYZ(1.0, 2.0, 4.0); + assertThat(xyz1).isEqualTo(xyz2); + assertThat(xyz1.hashCode()).isEqualTo(xyz2.hashCode()); + assertThat(xyz1).isNotEqualTo(xyz3); + assertThat(xyz1).isNotEqualTo(xy1); + + GeospatialBound xym1 = GeospatialBound.createXYM(1.0, 2.0, 4.0); + GeospatialBound xym2 = GeospatialBound.createXYM(1.0, 2.0, 4.0); + GeospatialBound xym3 = GeospatialBound.createXYM(1.0, 2.0, 5.0); + assertThat(xym1).isEqualTo(xym2); + assertThat(xym1.hashCode()).isEqualTo(xym2.hashCode()); + assertThat(xym1).isNotEqualTo(xym3); + assertThat(xym1).isNotEqualTo(xy1); + + GeospatialBound xyzm1 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + GeospatialBound xyzm2 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + GeospatialBound xyzm3 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 5.0); + assertThat(xyzm1).isEqualTo(xyzm2); + assertThat(xyzm1.hashCode()).isEqualTo(xyzm2.hashCode()); + assertThat(xyzm1).isNotEqualTo(xyzm3); + assertThat(xyzm1).isNotEqualTo(xyz1); + } + + @Test + public void testCompareTo() { + GeospatialBound xy1 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound xy2 = GeospatialBound.createXY(1.0, 2.0); + GeospatialBound xy3 = GeospatialBound.createXY(1.0, 3.0); + assertThat(xy1.compareTo(xy2)).isEqualTo(0); + assertThat(xy1.compareTo(xy3)).isLessThan(0); + assertThat(xy3.compareTo(xy1)).isGreaterThan(0); + + GeospatialBound xyz1 = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + GeospatialBound xyz2 = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + GeospatialBound xyz3 = GeospatialBound.createXYZ(1.0, 2.0, 4.0); + assertThat(xyz1.compareTo(xyz2)).isEqualTo(0); + assertThat(xyz1.compareTo(xyz3)).isLessThan(0); + assertThat(xyz3.compareTo(xyz1)).isGreaterThan(0); + + GeospatialBound xym1 = 
GeospatialBound.createXYM(1.0, 2.0, 4.0); + GeospatialBound xym2 = GeospatialBound.createXYM(1.0, 2.0, 4.0); + GeospatialBound xym3 = GeospatialBound.createXYM(1.0, 2.0, 5.0); + assertThat(xym1.compareTo(xym2)).isEqualTo(0); + assertThat(xym1.compareTo(xym3)).isLessThan(0); + assertThat(xym3.compareTo(xym1)).isGreaterThan(0); + + GeospatialBound xyzm1 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + GeospatialBound xyzm2 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + GeospatialBound xyzm3 = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 5.0); + assertThat(xyzm1.compareTo(xyzm2)).isEqualTo(0); + assertThat(xyzm1.compareTo(xyzm3)).isLessThan(0); + assertThat(xyzm3.compareTo(xyzm1)).isGreaterThan(0); + } + + @Test + public void testToString() { + GeospatialBound xy = GeospatialBound.createXY(1.0, 2.0); + assertThat(xy.toString()).isEqualTo("GeospatialBound(x=1.0, y=2.0)"); + + GeospatialBound xyz = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + assertThat(xyz.toString()).isEqualTo("GeospatialBound(x=1.0, y=2.0, z=3.0)"); + + GeospatialBound xym = GeospatialBound.createXYM(1.0, 2.0, 4.0); + assertThat(xym.toString()).isEqualTo("GeospatialBound(x=1.0, y=2.0, m=4.0)"); + + GeospatialBound xyzm = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + assertThat(xyzm.toString()).isEqualTo("GeospatialBound(x=1.0, y=2.0, z=3.0, m=4.0)"); + } + + @Test + public void testSimpleString() { + GeospatialBound xy = GeospatialBound.createXY(1.0, 2.0); + assertThat(xy.simpleString()).isEqualTo("x=1.0, y=2.0"); + + GeospatialBound xyz = GeospatialBound.createXYZ(1.0, 2.0, 3.0); + assertThat(xyz.simpleString()).isEqualTo("x=1.0, y=2.0, z=3.0"); + + GeospatialBound xym = GeospatialBound.createXYM(1.0, 2.0, 4.0); + assertThat(xym.simpleString()).isEqualTo("x=1.0, y=2.0, m=4.0"); + + GeospatialBound xyzm = GeospatialBound.createXYZM(1.0, 2.0, 3.0, 4.0); + assertThat(xyzm.simpleString()).isEqualTo("x=1.0, y=2.0, z=3.0, m=4.0"); + } + + @Test + public void testSerde() { + // Test XY format (16 
bytes: x:y) + // These bytes represent x=10.0, y=13.0 + byte[] xyBytes = + new byte[] { + 0, 0, 0, 0, 0, 0, 36, 64, // 10.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 42, 64 // 13.0 in little-endian IEEE 754 + }; + GeospatialBound xy = GeospatialBound.fromByteArray(xyBytes); + assertThat(xy.x()).isEqualTo(10.0); + assertThat(xy.y()).isEqualTo(13.0); + assertThat(xy.hasZ()).isFalse(); + assertThat(xy.hasM()).isFalse(); + assertThat(ByteBuffers.toByteArray(xy.toByteBuffer())).isEqualTo(xyBytes); + + // Test XYZ format (24 bytes: x:y:z) + // These bytes represent x=10.0, y=13.0, z=15.0 + byte[] xyzBytes = + new byte[] { + 0, 0, 0, 0, 0, 0, 36, 64, // 10.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 42, 64, // 13.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 46, 64 // 15.0 in little-endian IEEE 754 + }; + GeospatialBound xyz = GeospatialBound.fromByteArray(xyzBytes); + assertThat(xyz.x()).isEqualTo(10.0); + assertThat(xyz.y()).isEqualTo(13.0); + assertThat(xyz.z()).isEqualTo(15.0); + assertThat(xyz.hasZ()).isTrue(); + assertThat(xyz.hasM()).isFalse(); + assertThat(ByteBuffers.toByteArray(xyz.toByteBuffer())).isEqualTo(xyzBytes); + // Test XYM format (32 bytes: x:y:NaN:m) + // These bytes represent x=10.0, y=13.0, z=NaN, m=20.0 + byte[] xymBytes = + new byte[] { + 0, 0, 0, 0, 0, 0, 36, 64, // 10.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 42, 64, // 13.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, (byte) 248, 127, // NaN in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 52, 64 // 20.0 in little-endian IEEE 754 + }; + GeospatialBound xym = GeospatialBound.fromByteArray(xymBytes); + assertThat(xym.x()).isEqualTo(10.0); + assertThat(xym.y()).isEqualTo(13.0); + assertThat(Double.isNaN(xym.z())).isTrue(); + assertThat(xym.m()).isEqualTo(20.0); + assertThat(xym.hasZ()).isFalse(); + assertThat(xym.hasM()).isTrue(); + assertThat(ByteBuffers.toByteArray(xym.toByteBuffer())).isEqualTo(xymBytes); + + // Test XYZM format (32 bytes: x:y:z:m) + // These bytes represent 
x=10.0, y=13.0, z=15.0, m=20.0 + byte[] xyzmBytes = + new byte[] { + 0, 0, 0, 0, 0, 0, 36, 64, // 10.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 42, 64, // 13.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 46, 64, // 15.0 in little-endian IEEE 754 + 0, 0, 0, 0, 0, 0, 52, 64 // 20.0 in little-endian IEEE 754 + }; + GeospatialBound xyzm = GeospatialBound.fromByteArray(xyzmBytes); + assertThat(xyzm.x()).isEqualTo(10.0); + assertThat(xyzm.y()).isEqualTo(13.0); + assertThat(xyzm.z()).isEqualTo(15.0); + assertThat(xyzm.m()).isEqualTo(20.0); + assertThat(xyzm.hasZ()).isTrue(); + assertThat(xyzm.hasM()).isTrue(); + assertThat(ByteBuffers.toByteArray(xyzm.toByteBuffer())).isEqualTo(xyzmBytes); + } + + private GeospatialBound roundTripSerDe(GeospatialBound original) { + ByteBuffer buffer = original.toByteBuffer(); + return GeospatialBound.fromByteBuffer(buffer); + } + + @Test + public void testRoundTripSerDe() { + // Test XY serialization + GeospatialBound xy = GeospatialBound.createXY(1.1, 2.2); + assertThat(roundTripSerDe(xy)).isEqualTo(xy); + + // Test XYZ serialization + GeospatialBound xyz = GeospatialBound.createXYZ(1.1, 2.2, 3.3); + assertThat(roundTripSerDe(xyz)).isEqualTo(xyz); + + // Test XYM serialization + GeospatialBound xym = GeospatialBound.createXYM(1.1, 2.2, 4.4); + assertThat(roundTripSerDe(xym)).isEqualTo(xym); + + // Test XYZM serialization + GeospatialBound xyzm = GeospatialBound.createXYZM(1.1, 2.2, 3.3, 4.4); + assertThat(roundTripSerDe(xyzm)).isEqualTo(xyzm); + } +} diff --git a/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialPredicateEvaluators.java b/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialPredicateEvaluators.java new file mode 100644 index 000000000000..e3c2abbd1f86 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/geospatial/TestGeospatialPredicateEvaluators.java @@ -0,0 +1,430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.geospatial; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.iceberg.types.EdgeAlgorithm; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestGeospatialPredicateEvaluators { + + @Test + public void testGeometryType() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + assertThat(evaluator).isInstanceOf(GeospatialPredicateEvaluators.GeometryEvaluator.class); + } + + @Test + public void testOverlappingBoxesIntersect() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(5.0, 5.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(3.0, 3.0); + GeospatialBound max2 = GeospatialBound.createXY(8.0, 8.0); + BoundingBox box2 = new BoundingBox(min2, max2); + 
+ assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testNonOverlappingBoxesDontIntersect() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(2.0, 2.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(3.0, 3.0); + GeospatialBound max2 = GeospatialBound.createXY(5.0, 5.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isFalse(); + assertThat(evaluator.intersects(box2, box1)).isFalse(); + } + + @Test + public void testBoxesTouchingAtCornerIntersect() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(2.0, 2.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(2.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXY(4.0, 4.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testBoxesTouchingAtEdgeIntersect() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(2.0, 2.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = 
GeospatialBound.createXY(2.0, 0.0); + GeospatialBound max2 = GeospatialBound.createXY(4.0, 2.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testBoxContainedWithinAnotherIntersects() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(10.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(2.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXY(5.0, 5.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testBoxesWithZCoordinate() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + // Two boxes with Z coordinates that overlap in X and Y but not in Z + // Note: The current implementation only checks X and Y coordinates + GeospatialBound min1 = GeospatialBound.createXYZ(0.0, 0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXYZ(2.0, 2.0, 1.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXYZ(1.0, 1.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXYZ(3.0, 3.0, 3.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + // They should intersect because the current implementation only checks X and Y + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testBoxesWithMCoordinate() { + Type geometryType 
= Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + // Two boxes with M coordinates that overlap in X and Y but not in M + // Note: The current implementation only checks X and Y coordinates + GeospatialBound min1 = GeospatialBound.createXYM(0.0, 0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXYM(2.0, 2.0, 1.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXYM(1.0, 1.0, 2.0); + GeospatialBound max2 = GeospatialBound.createXYM(3.0, 3.0, 3.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + // They should intersect because the current implementation only checks X and Y + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testGeometryWrapAroundOnA() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + // First box wraps around antimeridian (min.x > max.x), second doesn't + GeospatialBound min1 = GeospatialBound.createXY(170.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(-170.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Box that overlaps with the part after the wrap around + GeospatialBound min2 = GeospatialBound.createXY(-175.0, 5.0); + GeospatialBound max2 = GeospatialBound.createXY(-160.0, 15.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + + // Box that overlaps with the part before the wrap around + GeospatialBound min3 = GeospatialBound.createXY(160.0, 5.0); + GeospatialBound max3 = GeospatialBound.createXY(175.0, 15.0); + BoundingBox box3 = new BoundingBox(min3, max3); + + assertThat(evaluator.intersects(box1, 
box3)).isTrue(); + assertThat(evaluator.intersects(box3, box1)).isTrue(); + + // Box that doesn't overlap with either part + GeospatialBound min4 = GeospatialBound.createXY(-150.0, 20.0); + GeospatialBound max4 = GeospatialBound.createXY(-140.0, 30.0); + BoundingBox box4 = new BoundingBox(min4, max4); + + assertThat(evaluator.intersects(box1, box4)).isFalse(); + assertThat(evaluator.intersects(box4, box1)).isFalse(); + } + + @Test + public void testGeometryWrapAroundOnB() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + // First box doesn't wrap around, second does (min.x > max.x) + GeospatialBound min1 = GeospatialBound.createXY(-175.0, 5.0); + GeospatialBound max1 = GeospatialBound.createXY(-160.0, 15.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(170.0, 0.0); + GeospatialBound max2 = GeospatialBound.createXY(-170.0, 10.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testBothGeometryWrappingAround() { + Type geometryType = Types.GeometryType.crs84(); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geometryType); + + // Both boxes wrap around (min.x > max.x) + GeospatialBound min1 = GeospatialBound.createXY(170.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(-170.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(160.0, 5.0); + GeospatialBound max2 = GeospatialBound.createXY(-160.0, 15.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + // When both wrap around, they must intersect + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, 
box1)).isTrue(); + } + + @Test + public void testBasicGeographyCases() { + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + // Two overlapping boxes + GeospatialBound min1 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(10.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + GeospatialBound min2 = GeospatialBound.createXY(5.0, 5.0); + GeospatialBound max2 = GeospatialBound.createXY(15.0, 15.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + + // Non-overlapping boxes + GeospatialBound min3 = GeospatialBound.createXY(20.0, 20.0); + GeospatialBound max3 = GeospatialBound.createXY(30.0, 30.0); + BoundingBox box3 = new BoundingBox(min3, max3); + + assertThat(evaluator.intersects(box1, box3)).isFalse(); + assertThat(evaluator.intersects(box3, box1)).isFalse(); + + // Boxes at extreme valid latitudes + GeospatialBound min4 = GeospatialBound.createXY(-10.0, -90.0); + GeospatialBound max4 = GeospatialBound.createXY(10.0, -80.0); + BoundingBox box4 = new BoundingBox(min4, max4); + + GeospatialBound min5 = GeospatialBound.createXY(-5.0, 80.0); + GeospatialBound max5 = GeospatialBound.createXY(15.0, 90.0); + BoundingBox box5 = new BoundingBox(min5, max5); + + assertThat(evaluator.intersects(box4, box5)).isFalse(); + assertThat(evaluator.intersects(box5, box4)).isFalse(); + } + + @Test + public void testGeographyWrapAround() { + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + // Box that wraps around the antimeridian + GeospatialBound min1 = GeospatialBound.createXY(170.0, 0.0); + 
GeospatialBound max1 = GeospatialBound.createXY(-170.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Box that overlaps with the part after the wrap around + GeospatialBound min2 = GeospatialBound.createXY(-175.0, 5.0); + GeospatialBound max2 = GeospatialBound.createXY(-160.0, 15.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + assertThat(evaluator.intersects(box1, box2)).isTrue(); + assertThat(evaluator.intersects(box2, box1)).isTrue(); + } + + @Test + public void testInvalidGeographyLatitude() { + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + // Box with latitude below -90 + GeospatialBound min1 = GeospatialBound.createXY(0.0, -91.0); + GeospatialBound max1 = GeospatialBound.createXY(10.0, 0.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Box with latitude above 90 + GeospatialBound min2 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max2 = GeospatialBound.createXY(10.0, 91.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + GeospatialBound validMin = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound validMax = GeospatialBound.createXY(10.0, 10.0); + BoundingBox validBox = new BoundingBox(validMin, validMax); + + assertThatThrownBy(() -> evaluator.intersects(box1, validBox)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Latitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(validBox, box1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Latitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(box2, validBox)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Latitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(validBox, box2)) + .isInstanceOf(IllegalArgumentException.class) + 
.hasMessageContaining("Latitude out of range"); + } + + @Test + public void testInvalidGeographyLongitude() { + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + // Box with longitude below -180 + GeospatialBound min1 = GeospatialBound.createXY(-181.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(0.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Box with longitude above 180 + GeospatialBound min2 = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound max2 = GeospatialBound.createXY(181.0, 10.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + GeospatialBound validMin = GeospatialBound.createXY(0.0, 0.0); + GeospatialBound validMax = GeospatialBound.createXY(10.0, 10.0); + BoundingBox validBox = new BoundingBox(validMin, validMax); + + assertThatThrownBy(() -> evaluator.intersects(box1, validBox)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Longitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(validBox, box1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Longitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(box2, validBox)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Longitude out of range"); + + assertThatThrownBy(() -> evaluator.intersects(validBox, box2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Longitude out of range"); + } + + @Test + public void testExtremeLongitudeBoundaries() { + // Tests valid boxes at the extreme boundaries of longitude + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + // Box at -180 longitude + GeospatialBound min1 = 
GeospatialBound.createXY(-180.0, 0.0); + GeospatialBound max1 = GeospatialBound.createXY(-170.0, 10.0); + BoundingBox box1 = new BoundingBox(min1, max1); + + // Box at 180 longitude + GeospatialBound min2 = GeospatialBound.createXY(170.0, 0.0); + GeospatialBound max2 = GeospatialBound.createXY(180.0, 10.0); + BoundingBox box2 = new BoundingBox(min2, max2); + + // These boxes should not intersect + assertThat(evaluator.intersects(box1, box2)).isFalse(); + assertThat(evaluator.intersects(box2, box1)).isFalse(); + + // Box that wraps around the antimeridian, touching -180 and 180 + GeospatialBound min3 = GeospatialBound.createXY(180.0, 0.0); + GeospatialBound max3 = GeospatialBound.createXY(-180.0, 10.0); + BoundingBox box3 = new BoundingBox(min3, max3); + + // This should intersect with both boxes at the extreme edges + assertThat(evaluator.intersects(box1, box3)).isTrue(); + assertThat(evaluator.intersects(box3, box1)).isTrue(); + assertThat(evaluator.intersects(box2, box3)).isTrue(); + assertThat(evaluator.intersects(box3, box2)).isTrue(); + } + + @Test + public void testSphericalGeographyType() { + Type geographyType = Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(geographyType); + + assertThat(evaluator).isInstanceOf(GeospatialPredicateEvaluators.GeographyEvaluator.class); + } + + @Test + public void testUnsupportedType() { + Type stringType = Types.StringType.get(); + + assertThatThrownBy(() -> GeospatialPredicateEvaluators.create(stringType)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Unsupported type for BoundingBox"); + } +} diff --git a/api/src/test/java/org/apache/iceberg/types/TestConversions.java b/api/src/test/java/org/apache/iceberg/types/TestConversions.java index e207cfd8d59a..3e1edd031005 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestConversions.java +++ 
b/api/src/test/java/org/apache/iceberg/types/TestConversions.java @@ -45,6 +45,7 @@ public class TestConversions { @Test + @SuppressWarnings("MethodLength") public void testByteBufferConversions() { // booleans are stored as 0x00 for 'false' and a non-zero byte for 'true' assertConversion(false, BooleanType.get(), new byte[] {0x00}); @@ -189,6 +190,13 @@ public void testByteBufferConversions() { assertConversion(new BigDecimal("0.011"), DecimalType.of(10, 3), new byte[] {11}); assertThat(Literal.of(new BigDecimal("0.011")).toByteBuffer().array()) .isEqualTo(new byte[] {11}); + + // geospatial bounds are kept as-is + // this is a geospatial bound with x = 10.0, y = 20.0 + byte[] geospatialBound = new byte[] {0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 52, 64}; + assertConversion(ByteBuffer.wrap(geospatialBound), Types.GeometryType.crs84(), geospatialBound); + assertConversion( + ByteBuffer.wrap(geospatialBound), Types.GeographyType.crs84(), geospatialBound); } private void assertConversion(T value, Type type, byte[] expectedBinary) { diff --git a/api/src/test/java/org/apache/iceberg/types/TestTypes.java b/api/src/test/java/org/apache/iceberg/types/TestTypes.java index cc8d3586b862..fa5ed4304d3c 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestTypes.java +++ b/api/src/test/java/org/apache/iceberg/types/TestTypes.java @@ -98,6 +98,8 @@ public void fromPrimitiveString() { assertThat(Types.fromPrimitiveString("geometry")).isEqualTo(Types.GeometryType.crs84()); assertThat(Types.fromPrimitiveString("Geometry")).isEqualTo(Types.GeometryType.crs84()); + assertThat(((Types.GeometryType) Types.fromPrimitiveString("geometry")).crs()) + .isEqualTo(Types.GeometryType.DEFAULT_CRS); assertThat(Types.fromPrimitiveString("geometry(srid:3857)")) .isEqualTo(Types.GeometryType.of("srid:3857")); assertThat(Types.fromPrimitiveString("geometry( srid:3857 )")) @@ -113,12 +115,13 @@ public void fromPrimitiveString() { assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> Types.fromPrimitiveString("geometry( )")) .withMessageContaining("Invalid CRS: (empty string)"); - assertThatExceptionOfType(IllegalArgumentException.class) - .isThrownBy(() -> Types.fromPrimitiveString("geometry(srid:123,456)")) - .withMessageContaining("Invalid CRS: srid:123,456"); assertThat(Types.fromPrimitiveString("geography")).isEqualTo(Types.GeographyType.crs84()); assertThat(Types.fromPrimitiveString("Geography")).isEqualTo(Types.GeographyType.crs84()); + assertThat(((Types.GeographyType) Types.fromPrimitiveString("geography")).crs()) + .isEqualTo(Types.GeographyType.DEFAULT_CRS); + assertThat(((Types.GeographyType) Types.fromPrimitiveString("geography")).algorithm()) + .isEqualTo(Types.GeographyType.DEFAULT_ALGORITHM); assertThat(Types.fromPrimitiveString("geography(srid:4269)")) .isEqualTo(Types.GeographyType.of("srid:4269")); assertThat(Types.fromPrimitiveString("geography(srid: 4269)")) diff --git a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java index b437b0bbf51c..7e8247195257 100644 --- a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java +++ b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java @@ -20,6 +20,8 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -157,6 +159,19 @@ public static Object generatePrimitive(Type.PrimitiveType primitive, Random rand BigDecimal bigDecimal = new BigDecimal(unscaled, type.scale()); return negate(choice) ? 
bigDecimal.negate() : bigDecimal; + case GEOMETRY: + case GEOGRAPHY: + // Generate a random point in range [0, 10) for both x and y coordinates + double coordX = random.nextDouble() * 10; + double coordY = random.nextDouble() * 10; + ByteBuffer buffer = ByteBuffer.allocate(21); + buffer.order(ByteOrder.LITTLE_ENDIAN); + buffer.put((byte) 1); // Byte order (1 for Little Endian) + buffer.putInt(1); // Geometry type (1 for Point) + buffer.putDouble(coordX); + buffer.putDouble(coordY); + return buffer.flip(); + default: throw new IllegalArgumentException( "Cannot generate random value for unknown type: " + primitive); diff --git a/core/src/main/java/org/apache/iceberg/expressions/ExpressionParser.java b/core/src/main/java/org/apache/iceberg/expressions/ExpressionParser.java index 9bb5b7d05f0b..9d8e684edf69 100644 --- a/core/src/main/java/org/apache/iceberg/expressions/ExpressionParser.java +++ b/core/src/main/java/org/apache/iceberg/expressions/ExpressionParser.java @@ -31,6 +31,8 @@ import java.util.function.Supplier; import org.apache.iceberg.Schema; import org.apache.iceberg.SingleValueParser; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -167,6 +169,9 @@ public Void predicate(BoundPredicate pred) { SingleValueParser.toJson(pred.term().type(), value, gen); } gen.writeEndArray(); + } else if (pred.isGeospatialPredicate()) { + gen.writeFieldName(VALUE); + geospatialBoundingBox(pred.asGeospatialPredicate().literal().value()); } gen.writeEndObject(); @@ -192,6 +197,13 @@ public Void predicate(UnboundPredicate pred) { } gen.writeEndArray(); + } else if (pred.op() == Expression.Operation.ST_INTERSECTS + || pred.op() == Expression.Operation.ST_DISJOINT) { + gen.writeFieldName(VALUE); + 
Literal min = pred.literals().get(0).to(Types.BinaryType.get()); + Literal max = pred.literals().get(1).to(Types.BinaryType.get()); + BoundingBox bbox = BoundingBox.fromByteBuffers(min.value(), max.value()); + geospatialBoundingBox(bbox); } else { gen.writeFieldName(VALUE); unboundLiteral(pred.literal().value()); @@ -229,6 +241,44 @@ private void unboundLiteral(Object object) throws IOException { } } + private void geospatialBoundingBox(BoundingBox value) throws IOException { + gen.writeStartObject(); + + // Write x coordinate + gen.writeFieldName("x"); + gen.writeStartObject(); + gen.writeNumberField("min", value.min().x()); + gen.writeNumberField("max", value.max().x()); + gen.writeEndObject(); + + // Write y coordinate + gen.writeFieldName("y"); + gen.writeStartObject(); + gen.writeNumberField("min", value.min().y()); + gen.writeNumberField("max", value.max().y()); + gen.writeEndObject(); + + // Write z coordinate if present + if (value.min().hasZ() || value.max().hasZ()) { + gen.writeFieldName("z"); + gen.writeStartObject(); + gen.writeNumberField("min", value.min().z()); + gen.writeNumberField("max", value.max().z()); + gen.writeEndObject(); + } + + // Write m coordinate if present + if (value.min().hasM() || value.max().hasM()) { + gen.writeFieldName("m"); + gen.writeStartObject(); + gen.writeNumberField("min", value.min().m()); + gen.writeNumberField("max", value.max().m()); + gen.writeEndObject(); + } + + gen.writeEndObject(); + } + private String operationType(Expression.Operation op) { return op.toString().replaceAll("_", "-").toLowerCase(Locale.ENGLISH); } @@ -306,6 +356,9 @@ static Expression fromJson(JsonNode json, Schema schema) { return Expressions.or( fromJson(JsonUtil.get(LEFT, json), schema), fromJson(JsonUtil.get(RIGHT, json), schema)); + case ST_INTERSECTS: + case ST_DISJOINT: + return geospatialPredicateFromJson(op, json); } return predicateFromJson(op, json, schema); @@ -374,6 +427,15 @@ private static UnboundPredicate predicateFromJson( } } + 
private static Expression geospatialPredicateFromJson(Expression.Operation op, JsonNode node) { + UnboundTerm term = term(JsonUtil.get(TERM, node)); + Preconditions.checkArgument(node.has(VALUE), "Cannot parse %s predicate: missing value", op); + Preconditions.checkArgument( + !node.has(VALUES), "Cannot parse %s predicate: has invalid values field", op); + BoundingBox boundingBox = geospatialBoundingBox(JsonUtil.get(VALUE, node)); + return Expressions.geospatialPredicate(op, term, boundingBox); + } + private static T literal(JsonNode valueNode, Function toValue) { if (valueNode.isObject() && valueNode.has(TYPE)) { String type = JsonUtil.getString(TYPE, valueNode); @@ -386,6 +448,51 @@ private static T literal(JsonNode valueNode, Function toValue) return toValue.apply(valueNode); } + private static BoundingBox geospatialBoundingBox(JsonNode valueNode) { + // X and Y coordinates are required + double xMin = valueNode.get("x").get("min").asDouble(); + double xMax = valueNode.get("x").get("max").asDouble(); + double yMin = valueNode.get("y").get("min").asDouble(); + double yMax = valueNode.get("y").get("max").asDouble(); + + // Create GeospatialBound objects for min and max + GeospatialBound minBound; + GeospatialBound maxBound; + + // Check if Z coordinate exists + boolean hasZ = valueNode.has("z"); + // Check if M coordinate exists + boolean hasM = valueNode.has("m"); + + if (hasZ && hasM) { + // Both Z and M present + double zMin = valueNode.get("z").get("min").asDouble(); + double zMax = valueNode.get("z").get("max").asDouble(); + double mMin = valueNode.get("m").get("min").asDouble(); + double mMax = valueNode.get("m").get("max").asDouble(); + minBound = GeospatialBound.createXYZM(xMin, yMin, zMin, mMin); + maxBound = GeospatialBound.createXYZM(xMax, yMax, zMax, mMax); + } else if (hasZ) { + // Only Z present, no M + double zMin = valueNode.get("z").get("min").asDouble(); + double zMax = valueNode.get("z").get("max").asDouble(); + minBound = 
GeospatialBound.createXYZ(xMin, yMin, zMin); + maxBound = GeospatialBound.createXYZ(xMax, yMax, zMax); + } else if (hasM) { + // Only M present, no Z + double mMin = valueNode.get("m").get("min").asDouble(); + double mMax = valueNode.get("m").get("max").asDouble(); + minBound = GeospatialBound.createXYM(xMin, yMin, mMin); + maxBound = GeospatialBound.createXYM(xMax, yMax, mMax); + } else { + // Only X and Y present + minBound = GeospatialBound.createXY(xMin, yMin); + maxBound = GeospatialBound.createXY(xMax, yMax); + } + + return new BoundingBox(minBound, maxBound); + } + private static Object asObject(JsonNode node) { if (node.isIntegralNumber() && node.canConvertToLong()) { return node.asLong(); diff --git a/core/src/test/java/org/apache/iceberg/InternalTestHelpers.java b/core/src/test/java/org/apache/iceberg/InternalTestHelpers.java index 781051f11d7b..226f105f5310 100644 --- a/core/src/test/java/org/apache/iceberg/InternalTestHelpers.java +++ b/core/src/test/java/org/apache/iceberg/InternalTestHelpers.java @@ -91,6 +91,8 @@ private static void assertEquals(Type type, Object expected, Object actual) { case FIXED: case BINARY: case DECIMAL: + case GEOMETRY: + case GEOGRAPHY: assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected); break; case STRUCT: diff --git a/core/src/test/java/org/apache/iceberg/expressions/TestExpressionParser.java b/core/src/test/java/org/apache/iceberg/expressions/TestExpressionParser.java index 43e2f13b55c9..fca3ed192501 100644 --- a/core/src/test/java/org/apache/iceberg/expressions/TestExpressionParser.java +++ b/core/src/test/java/org/apache/iceberg/expressions/TestExpressionParser.java @@ -28,6 +28,8 @@ import java.nio.ByteBuffer; import java.util.UUID; import org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -51,7 +53,9 @@ public class 
TestExpressionParser { required(114, "dec_9_0", Types.DecimalType.of(9, 0)), required(115, "dec_11_2", Types.DecimalType.of(11, 2)), required(116, "dec_38_10", Types.DecimalType.of(38, 10)), // maximum precision - required(117, "time", Types.TimeType.get())); + required(117, "time", Types.TimeType.get()), + required(118, "geom", Types.GeometryType.crs84()), + required(119, "geog", Types.GeographyType.crs84())); private static final Schema SCHEMA = new Schema(SUPPORTED_PRIMITIVES.fields()); @Test @@ -94,7 +98,22 @@ public void testSimpleExpressions() { Expressions.or( Expressions.greaterThan(Expressions.day("ts"), "2022-08-14"), Expressions.equal("date", "2022-08-14")), - Expressions.not(Expressions.in("l", 1, 2, 3, 4)) + Expressions.not(Expressions.in("l", 1, 2, 3, 4)), + Expressions.stIntersects( + "geom", + new BoundingBox(GeospatialBound.createXY(1, 2), GeospatialBound.createXY(3, 4))), + Expressions.stDisjoint( + "geom", + new BoundingBox( + GeospatialBound.createXYM(1, 2, 3), GeospatialBound.createXYM(3, 4, 5))), + Expressions.stIntersects( + "geog", + new BoundingBox( + GeospatialBound.createXYZ(1, 2, 3), GeospatialBound.createXYZ(3, 4, 5))), + Expressions.stDisjoint( + "geog", + new BoundingBox( + GeospatialBound.createXYZM(1, 2, 3, 4), GeospatialBound.createXYZM(3, 4, 5, 6))) }; for (Expression expr : expressions) { @@ -544,4 +563,122 @@ public void testNegativeScaleDecimalLiteral() { assertThat(ExpressionParser.toJson(ExpressionParser.fromJson(expected), true)) .isEqualTo(expected); } + + @Test + public void testSpatialPredicate() { + String expected = + "{\n" + + " \"type\" : \"st-intersects\",\n" + + " \"term\" : \"column-name\",\n" + + " \"value\" : {\n" + + " \"x\" : {\n" + + " \"min\" : 1.0,\n" + + " \"max\" : 3.0\n" + + " },\n" + + " \"y\" : {\n" + + " \"min\" : 2.0,\n" + + " \"max\" : 4.0\n" + + " }\n" + + " }\n" + + "}"; + + Expression expression = + Expressions.stIntersects( + "column-name", + new BoundingBox(GeospatialBound.createXY(1, 2), 
GeospatialBound.createXY(3, 4))); + assertThat(ExpressionParser.toJson(expression, true)).isEqualTo(expected); + assertThat(ExpressionParser.toJson(ExpressionParser.fromJson(expected), true)) + .isEqualTo(expected); + + expected = + "{\n" + + " \"type\" : \"st-intersects\",\n" + + " \"term\" : \"column-name\",\n" + + " \"value\" : {\n" + + " \"x\" : {\n" + + " \"min\" : 1.0,\n" + + " \"max\" : 3.0\n" + + " },\n" + + " \"y\" : {\n" + + " \"min\" : 2.0,\n" + + " \"max\" : 4.0\n" + + " },\n" + + " \"m\" : {\n" + + " \"min\" : 3.0,\n" + + " \"max\" : 5.0\n" + + " }\n" + + " }\n" + + "}"; + + expression = + Expressions.stIntersects( + "column-name", + new BoundingBox( + GeospatialBound.createXYM(1, 2, 3), GeospatialBound.createXYM(3, 4, 5))); + assertThat(ExpressionParser.toJson(expression, true)).isEqualTo(expected); + assertThat(ExpressionParser.toJson(ExpressionParser.fromJson(expected), true)) + .isEqualTo(expected); + + expected = + "{\n" + + " \"type\" : \"st-intersects\",\n" + + " \"term\" : \"column-name\",\n" + + " \"value\" : {\n" + + " \"x\" : {\n" + + " \"min\" : 1.0,\n" + + " \"max\" : 3.0\n" + + " },\n" + + " \"y\" : {\n" + + " \"min\" : 2.0,\n" + + " \"max\" : 4.0\n" + + " },\n" + + " \"z\" : {\n" + + " \"min\" : 3.0,\n" + + " \"max\" : 5.0\n" + + " }\n" + + " }\n" + + "}"; + + expression = + Expressions.stIntersects( + "column-name", + new BoundingBox( + GeospatialBound.createXYZ(1, 2, 3), GeospatialBound.createXYZ(3, 4, 5))); + assertThat(ExpressionParser.toJson(expression, true)).isEqualTo(expected); + assertThat(ExpressionParser.toJson(ExpressionParser.fromJson(expected), true)) + .isEqualTo(expected); + + expected = + "{\n" + + " \"type\" : \"st-intersects\",\n" + + " \"term\" : \"column-name\",\n" + + " \"value\" : {\n" + + " \"x\" : {\n" + + " \"min\" : 1.0,\n" + + " \"max\" : 3.0\n" + + " },\n" + + " \"y\" : {\n" + + " \"min\" : 2.0,\n" + + " \"max\" : 4.0\n" + + " },\n" + + " \"z\" : {\n" + + " \"min\" : 3.0,\n" + + " \"max\" : 5.0\n" + + " },\n" 
+ + " \"m\" : {\n" + + " \"min\" : 4.0,\n" + + " \"max\" : 6.0\n" + + " }\n" + + " }\n" + + "}"; + + expression = + Expressions.stIntersects( + "column-name", + new BoundingBox( + GeospatialBound.createXYZM(1, 2, 3, 4), GeospatialBound.createXYZM(3, 4, 5, 6))); + assertThat(ExpressionParser.toJson(expression, true)).isEqualTo(expected); + assertThat(ExpressionParser.toJson(ExpressionParser.fromJson(expected), true)) + .isEqualTo(expected); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java index fc8d47680b0f..80afa4a0f487 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java +++ b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java @@ -120,6 +120,8 @@ private static void assertEquals(Type type, Object expected, Object actual) { case UUID: case BINARY: case DECIMAL: + case GEOMETRY: + case GEOGRAPHY: assertThat(actual) .as("Primitive value should be equal to expected for type " + type) .isEqualTo(expected); diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java index f19f9728299a..ee4c43bbb57e 100644 --- a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java @@ -68,6 +68,11 @@ protected boolean supportsRowLineage() { return true; } + @Override + protected boolean supportsGeospatial() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(schema, schema); diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGeographyReadersAndWriters.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeographyReadersAndWriters.java new file mode 100644 index 000000000000..455e708812ee --- /dev/null +++ 
b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeographyReadersAndWriters.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.hadoop.HadoopInputFile; +import org.apache.iceberg.hadoop.HadoopOutputFile; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.OutputFile; +import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.geom.Point; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; +import org.locationtech.jts.io.WKBWriter; + +public class TestGeographyReadersAndWriters { + private final Schema schema; + private static final Configuration CONF = new Configuration(); + + @TempDir Path tempDir; + + private final List testData; + + public TestGeographyReadersAndWriters() { + this.schema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(3, "geog", Types.GeographyType.crs84())); + testData = prepareTestData(); + } + + private List prepareTestData() { + List recordList = Lists.newArrayList(); + GeometryFactory factory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + for (long id = 0; id < 1000; id++) { + // lng: -100 to 100, lat: -50 to 50 + double lng = id * 0.2 - 100; + double lat = id * 0.1 - 50; + Coordinate center = new Coordinate(lng, lat); + byte[] wkb = wkbWriter.write(factory.createPoint(center)); + ByteBuffer geog = ByteBuffer.wrap(wkb); + Record record = GenericRecord.create(schema); + record.setField("id", id); + record.setField("geog", geog); + recordList.add(record); + } + return recordList; + } + + @Test + public void testWriteAndReadGeometryValues() throws IOException, ParseException { + // Create a table + File location = tempDir.toFile(); + Table table = + TestTables.create( + location, + "geog_table", + schema, + PartitionSpec.unpartitioned(), + 3, + 
ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, "parquet")); + + // Write some data + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema()); + Path path = tempDir.resolve("data.parquet"); + OutputFile outputFile = + HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(path.toString()), CONF); + try (var fileAppender = appenderFactory.newAppender(outputFile, FileFormat.PARQUET)) { + fileAppender.addAll(testData); + fileAppender.close(); + Metrics metrics = fileAppender.metrics(); + + // Commit the data file to the table + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withInputFile( + HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toString()), CONF)) + .withFormat(FileFormat.PARQUET) + .withMetrics(metrics) + .build(); + table.newAppend().appendFile(dataFile).commit(); + + // Read the data + WKBReader wkbReader = new WKBReader(); + try (CloseableIterable reader = IcebergGenerics.read(table).build()) { + int numRecords = 0; + for (Record record : reader) { + ByteBuffer geogWkb = (ByteBuffer) record.getField("geog"); + Geometry geometry = wkbReader.read(ByteBuffers.toByteArray(geogWkb)); + assertThat(geometry).isInstanceOf(Point.class); + numRecords++; + } + assertThat(numRecords).as("Record count must match").isEqualTo(testData.size()); + } + } + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGeometryReadersAndWriters.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeometryReadersAndWriters.java new file mode 100644 index 000000000000..50f3d2c07c1f --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeometryReadersAndWriters.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import org.apache.commons.collections.IteratorUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; +import org.apache.iceberg.hadoop.HadoopInputFile; +import org.apache.iceberg.hadoop.HadoopOutputFile; +import org.apache.iceberg.io.CloseableIterable; +import 
org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBWriter; + +public class TestGeometryReadersAndWriters { + + private final Schema schema; + private static final Configuration CONF = new Configuration(); + private final List> testData; + + @TempDir java.nio.file.Path tempDir; + + public TestGeometryReadersAndWriters() { + this.schema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "part", Types.IntegerType.get()), + Types.NestedField.optional(3, "geom", Types.GeometryType.crs84())); + testData = prepareTestData(); + } + + private List> prepareTestData() { + List> recordsInDataFiles = Lists.newArrayList(); + GeometryFactory factory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + long id = 0; + for (int i = 0; i < 4; i++) { + List recordList = Lists.newArrayList(); + for (int k = 1; k <= 10; k++) { + Coordinate center; + switch (i) { + case 0: + center = new Coordinate(k, k); + break; + case 1: + center = new Coordinate(-k, k); + break; + case 2: + center = new Coordinate(-k, -k); + break; + case 3: + default: + center = new Coordinate(k, -k); + break; + } + byte[] pointWkb = wkbWriter.write(factory.createPoint(center)); + ByteBuffer pointBuffer = ByteBuffer.wrap(pointWkb); + + Record record = GenericRecord.create(schema); + record.setField("id", id); + record.setField("part", i); + record.setField("geom", pointBuffer); + recordList.add(record); + id++; + } + recordsInDataFiles.add(recordList); + } + return 
recordsInDataFiles; + } + + @Test + public void testFilterTableWithSpatialPredicates() throws IOException { + File location = tempDir.toFile(); + Table table = + TestTables.create( + location, + "geom_table", + schema, + PartitionSpec.unpartitioned(), + 3, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, "parquet")); + + AppendFiles append = table.newAppend(); + for (int i = 0; i < testData.size(); i++) { + List fileContent = testData.get(i); + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema()); + Path path = new Path(location.toString(), "data-" + i + ".parquet"); + OutputFile outputFile = HadoopOutputFile.fromPath(path, CONF); + FileAppender fileAppender = + appenderFactory.newAppender(outputFile, FileFormat.PARQUET); + fileAppender.addAll(fileContent); + fileAppender.close(); + Metrics metrics = fileAppender.metrics(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withInputFile(HadoopInputFile.fromPath(path, CONF)) + .withFormat(FileFormat.PARQUET) + .withMetrics(metrics) + .build(); + append.appendFile(dataFile); + } + append.commit(); + + // Replace the createPoint and toGeometry calls with createBoundingBox helper calls. 
+ Expression expr = Expressions.stIntersects("geom", createBoundingBox(1, 1)); + validateScan(table, expr, 1); + expr = Expressions.stIntersects("geom", createBoundingBox(0, 0)); + validateScan(table, expr, 0); + expr = Expressions.stIntersects("geom", createBoundingBox(1.5, 1.5)); + validateScan(table, expr, 1); + expr = Expressions.stIntersects("geom", createBoundingBox(0.5, -1.1, 1.1, 1.1)); + validateScan(table, expr, 2); + expr = Expressions.stIntersects("geom", createBoundingBox(0, 0, 0.75, 0.75)); + validateScan(table, expr, 0); + expr = Expressions.stIntersects("geom", createBoundingBox(0.75, 0.75, 1.25, 1.25)); + validateScan(table, expr, 1); + + expr = + Expressions.and( + Expressions.lessThanOrEqual("id", 10L), + Expressions.stIntersects("geom", createBoundingBox(0.5, -1.1, 1.1, 1.1))); + validateScan(table, expr, 1); + } + + @Test + public void testPartitionedGeometryTable() throws IOException { + File location = tempDir.toFile(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("part").build(); + Table table = + TestTables.create( + location, + "test_partitioned", + schema, + spec, + 3, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, "parquet")); + + AppendFiles append = table.newAppend(); + for (int i = 0; i < testData.size(); i++) { + List fileContent = testData.get(i); + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema()); + Path path = new Path(location.toString(), "data-" + i + ".parquet"); + OutputFile outputFile = HadoopOutputFile.fromPath(path, CONF); + FileAppender fileAppender = + appenderFactory.newAppender(outputFile, FileFormat.PARQUET); + fileAppender.addAll(fileContent); + fileAppender.close(); + Metrics metrics = fileAppender.metrics(); + PartitionData partition = new PartitionData(spec.partitionType()); + partition.set(0, i); + DataFile dataFile = + DataFiles.builder(spec) + .withInputFile(HadoopInputFile.fromPath(path, CONF)) + .withPartition(partition) + 
.withFormat(FileFormat.PARQUET) + .withMetrics(metrics) + .build(); + append.appendFile(dataFile); + } + append.commit(); + + // Use createBoundingBox in spatial predicate call. + Expression expr = Expressions.stIntersects("geom", createBoundingBox(1, 1)); + validateScan(table, expr, 1); + expr = + Expressions.and( + Expressions.equal("part", 3), + Expressions.stIntersects("geom", createBoundingBox(0.5, -1.1, 1.1, 1.1))); + validateScan(table, expr, 1); + } + + @SuppressWarnings("unchecked") + private void validateScan(Table table, Expression expr, int expectedScannedFileNum) + throws IOException { + List results; + try (CloseableIterable reader = IcebergGenerics.read(table).where(expr).build(); + CloseableIterator iter = reader.iterator()) { + results = (List) IteratorUtils.toList(iter); + if (expectedScannedFileNum > 0) { + assertThat(results).isNotEmpty(); + } + } + try (CloseableIterable planFiles = table.newScan().filter(expr).planFiles(); + CloseableIterator fileScanTasks = planFiles.iterator()) { + int numScannedFiles = IteratorUtils.toList(fileScanTasks).size(); + assertThat(numScannedFiles).isEqualTo(expectedScannedFileNum); + } + } + + // Helper method for a point bounding box when both min and max coordinates are equal. + @SuppressWarnings("ParameterName") + private static BoundingBox createBoundingBox(double x, double y) { + return new BoundingBox(GeospatialBound.createXY(x, y), GeospatialBound.createXY(x, y)); + } + + // Helper method for a bounding box defined by min and max coordinates. 
+ private static BoundingBox createBoundingBox(double minX, double minY, double maxX, double maxY) { + return new BoundingBox( + GeospatialBound.createXY(minX, minY), GeospatialBound.createXY(maxX, maxY)); + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGeospatialMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeospatialMetricsRowGroupFilter.java new file mode 100644 index 000000000000..d0e445e273c8 --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGeospatialMetricsRowGroupFilter.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.types.Types; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.schema.MessageType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBWriter; + +public class TestGeospatialMetricsRowGroupFilter { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "geom", Types.GeometryType.crs84())); + + @TempDir private File tempDir; + + private static class ParquetFileMetadata { + BlockMetaData blockMetaData; + MessageType schema; + } + + private ParquetFileMetadata nonEmptyBlockMetadata; + private ParquetFileMetadata emptyBlockMetadata; + private ParquetFileMetadata nullBlockMetadata; + + @BeforeEach + public void createTestFiles() throws IOException { + File nonEmptyFile = new File(tempDir, "test_file_non_empty.parquet"); + File emptyFile = new 
File(tempDir, "test_file_empty.parquet"); + File nullFile = new File(tempDir, "test_file_null.parquet"); + + GeometryFactory factory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + // Create test files with different geometries + GenericRecord record = GenericRecord.create(SCHEMA); + record.setField("id", 1); + Geometry polygon = factory.toGeometry(new Envelope(1, 2, 3, 4)); + byte[] polygonWkb = wkbWriter.write(polygon); + + record.setField("geom", ByteBuffer.wrap(polygonWkb)); + nonEmptyBlockMetadata = createFileWithRecord(nonEmptyFile, record); + + byte[] emptyLineString = wkbWriter.write(factory.createLineString()); + record.setField("geom", ByteBuffer.wrap(emptyLineString)); + emptyBlockMetadata = createFileWithRecord(emptyFile, record); + + record.setField("geom", null); + nullBlockMetadata = createFileWithRecord(nullFile, record); + } + + private ParquetFileMetadata createFileWithRecord(File file, GenericRecord record) + throws IOException { + OutputFile outputFile = Files.localOutput(file); + try (FileAppender appender = + Parquet.write(outputFile) + .schema(SCHEMA) + .createWriterFunc(fileSchema -> InternalWriter.create(SCHEMA.asStruct(), fileSchema)) + .build()) { + appender.add(record); + } + + LocalInputFile inFile = new LocalInputFile(file.toPath()); + try (ParquetFileReader reader = ParquetFileReader.open(inFile)) { + assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); + ParquetFileMetadata metadata = new ParquetFileMetadata(); + metadata.schema = reader.getFileMetaData().getSchema(); + metadata.blockMetaData = reader.getRowGroups().get(0); + return metadata; + } + } + + @Test + public void testHitNonEmptyFile() { + boolean shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 3, 2, 4)), nonEmptyBlockMetadata); + assertThat(shouldRead).isTrue(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 2, 3, 4)), 
nonEmptyBlockMetadata); + assertThat(shouldRead).isTrue(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(0, 0, 5, 5)), nonEmptyBlockMetadata); + assertThat(shouldRead).isTrue(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 3, 1, 3)), nonEmptyBlockMetadata); + assertThat(shouldRead).isTrue(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(2, 4, 2, 4)), nonEmptyBlockMetadata); + assertThat(shouldRead).isTrue(); + } + + @Test + public void testNotHitNonEmptyFile() { + boolean shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(0, 0, 1, 1)), nonEmptyBlockMetadata); + assertThat(shouldRead).isFalse(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(4, 4, 5, 5)), nonEmptyBlockMetadata); + assertThat(shouldRead).isFalse(); + } + + @Test + public void testHitEmptyFile() { + // We cannot skip row groups without geospatial bounding box. 
+ boolean shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 3, 2, 4)), emptyBlockMetadata); + assertThat(shouldRead).isTrue(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 2, 3, 4)), emptyBlockMetadata); + assertThat(shouldRead).isTrue(); + } + + @Test + public void testNotHitNullFile() { + boolean shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 3, 2, 4)), nullBlockMetadata); + assertThat(shouldRead).isFalse(); + + shouldRead = + shouldReadParquet( + Expressions.stIntersects("geom", createBoundingBox(1, 2, 3, 4)), nullBlockMetadata); + assertThat(shouldRead).isFalse(); + } + + private boolean shouldReadParquet(Expression expression, ParquetFileMetadata metadata) { + return new ParquetMetricsRowGroupFilter(SCHEMA, expression, true) + .shouldRead(metadata.schema, metadata.blockMetaData); + } + + private static BoundingBox createBoundingBox(double minX, double minY, double maxX, double maxY) { + return new BoundingBox( + GeospatialBound.createXY(minX, minY), GeospatialBound.createXY(maxX, maxY)); + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetEncryptionWithWriteSupport.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetEncryptionWithWriteSupport.java index 32d858d93289..23ea2c28250f 100644 --- a/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetEncryptionWithWriteSupport.java +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetEncryptionWithWriteSupport.java @@ -71,6 +71,11 @@ protected boolean supportsVariant() { return true; } + @Override + protected boolean supportsGeospatial() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomGenericData.generate(schema, 100, 0L); diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java 
b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java index 8f2957e1c60d..e65964fb8a6b 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java @@ -202,6 +202,18 @@ public Optional> visit( LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { return Optional.of(ParquetValueReaders.uuids(desc)); } + + @Override + public Optional> visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + return Optional.of(ParquetValueReaders.byteBuffers(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { + return Optional.of(ParquetValueReaders.byteBuffers(desc)); + } } private class ReadBuilder extends TypeWithSchemaVisitor> { diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java index 2a986fc62d00..e0ff6a4713bb 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java @@ -270,5 +270,17 @@ public Optional> visit( LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { return Optional.of(ParquetValueWriters.uuids(desc)); } + + @Override + public Optional> visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryType) { + return Optional.of(ParquetValueWriters.byteBuffers(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType) { + return Optional.of(ParquetValueWriters.byteBuffers(desc)); + } } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java index 26ef6e468ede..babc3ee36118 100644 --- 
a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java @@ -29,9 +29,11 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.EdgeAlgorithm; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.TimestampType; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; @@ -242,6 +244,41 @@ public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation json public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonType) { return Optional.of(Types.BinaryType.get()); } + + @Override + public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryType) { + String crs = geometryType.getCrs(); + return Optional.of(Types.GeometryType.of(crs)); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType) { + String crs = geographyType.getCrs(); + EdgeInterpolationAlgorithm algorithm = geographyType.getAlgorithm(); + EdgeAlgorithm edgeAlgorithm; + switch (algorithm) { + case SPHERICAL: + edgeAlgorithm = EdgeAlgorithm.SPHERICAL; + break; + case VINCENTY: + edgeAlgorithm = EdgeAlgorithm.VINCENTY; + break; + case THOMAS: + edgeAlgorithm = EdgeAlgorithm.THOMAS; + break; + case ANDOYER: + edgeAlgorithm = EdgeAlgorithm.ANDOYER; + break; + case KARNEY: + edgeAlgorithm = EdgeAlgorithm.KARNEY; + break; + default: + throw new UnsupportedOperationException( + "Cannot convert unknown edge algorithm: " + algorithm); + } + return Optional.of(Types.GeographyType.of(crs, edgeAlgorithm)); + } } private 
void addAlias(String name, int fieldId) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java index 553b8a0fa3f8..67b94e38f09e 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java @@ -32,6 +32,7 @@ import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; @@ -259,6 +260,18 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + public Boolean stIntersects(BoundReference ref, Literal lit) { + // bloom filter is based on hash and cannot eliminate based on stIntersects + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean stDisjoint(BoundReference ref, Literal lit) { + // bloom filter is based on hash and cannot eliminate based on stDisjoint + return ROWS_MIGHT_MATCH; + } + private BloomFilter loadBloomFilter(int id) { if (bloomCache.containsKey(id)) { return bloomCache.get(id); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java index 1d24b7ccd71f..213aa84bfb18 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java @@ -33,6 +33,7 @@ import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; 
import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.geospatial.BoundingBox; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; @@ -406,6 +407,16 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_CANNOT_MATCH; } + @Override + public Boolean stIntersects(BoundReference ref, Literal lit) { + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean stDisjoint(BoundReference ref, Literal lit) { + return ROWS_MIGHT_MATCH; + } + @SuppressWarnings("unchecked") private Set dict(int id, Comparator comparator) { Preconditions.checkNotNull(dictionaries, "Dictionary is required"); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetrics.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetrics.java index af6566e747b2..0974d7c823a0 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetrics.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetrics.java @@ -33,6 +33,7 @@ import org.apache.iceberg.MetricsModes; import org.apache.iceberg.MetricsUtil; import org.apache.iceberg.Schema; +import org.apache.iceberg.geospatial.GeospatialBound; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -53,6 +54,8 @@ import org.apache.iceberg.variants.VariantValue; import org.apache.iceberg.variants.Variants; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.BoundingBox; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import 
org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; @@ -266,6 +269,9 @@ private FieldMetrics metricsFromFooter( return null; } else if (truncateLength <= 0) { return counts(fieldId); + } else if (icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.GEOMETRY + || icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.GEOGRAPHY) { + return geospatialBounds(fieldId, icebergType); } else { return bounds(fieldId, icebergType, primitive, truncateLength); } @@ -342,6 +348,70 @@ private FieldMetrics bounds( return new FieldMetrics<>(fieldId, valueCount, nullCount, lower, upper, icebergType); } + private FieldMetrics geospatialBounds( + int fieldId, org.apache.iceberg.types.Type.PrimitiveType icebergType) { + if (icebergType == null) { + return null; + } + + long valueCount = 0; + long nullCount = 0; + boolean isBoundValid = true; + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double minZ = Double.POSITIVE_INFINITY; + double minM = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + double maxZ = Double.NEGATIVE_INFINITY; + double maxM = Double.NEGATIVE_INFINITY; + + ColumnPath path = ColumnPath.get(currentPath()); + for (ColumnChunkMetaData column : columns.get(path)) { + Statistics stats = column.getStatistics(); + if (stats == null || stats.isEmpty()) { + return null; + } + + nullCount += stats.getNumNulls(); + valueCount += column.getValueCount(); + + // We cannot make any assumption about the geospatial values when geospatial bound is + // missing. 
+ // See https://github.com/apache/parquet-format/pull/494 + GeospatialStatistics geoStats = column.getGeospatialStatistics(); + if (geoStats == null) { + isBoundValid = false; + continue; + } + + BoundingBox boundingBox = geoStats.getBoundingBox(); + if (boundingBox == null || !boundingBox.isValid() || boundingBox.isXYEmpty()) { + isBoundValid = false; + continue; + } + + minX = Math.min(minX, boundingBox.getXMin()); + minY = Math.min(minY, boundingBox.getYMin()); + minZ = Math.min(minZ, boundingBox.getZMin()); + minM = Math.min(minM, boundingBox.getMMin()); + maxX = Math.max(maxX, boundingBox.getXMax()); + maxY = Math.max(maxY, boundingBox.getYMax()); + maxZ = Math.max(maxZ, boundingBox.getZMax()); + maxM = Math.max(maxM, boundingBox.getMMax()); + } + + // X and Y should be valid, otherwise the bound will be invalid and should be ignored + if (!isBoundValid) { + return new FieldMetrics<>(fieldId, valueCount, nullCount); + } + + GeospatialBound lower = GeospatialBound.createXYZM(minX, minY, minZ, minM); + GeospatialBound upper = GeospatialBound.createXYZM(maxX, maxY, maxZ, maxM); + return new FieldMetrics<>( + fieldId, valueCount, nullCount, lower.toByteBuffer(), upper.toByteBuffer()); + } + @Override @SuppressWarnings("CyclomaticComplexity") public Iterable> variant( diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 1ad346d39ab7..97dbedd40aa3 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -34,12 +34,16 @@ import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.geospatial.BoundingBox; +import org.apache.iceberg.geospatial.GeospatialBound; 
+import org.apache.iceberg.geospatial.GeospatialPredicateEvaluators; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.BinaryUtil; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.io.api.Binary; @@ -78,6 +82,7 @@ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { private class MetricsEvalVisitor extends BoundExpressionVisitor { private Map> stats = null; + private Map geoStats = null; private Map valueCounts = null; private Map> conversions = null; @@ -87,6 +92,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { } this.stats = Maps.newHashMap(); + this.geoStats = Maps.newHashMap(); this.valueCounts = Maps.newHashMap(); this.conversions = Maps.newHashMap(); for (ColumnChunkMetaData col : rowGroup.getColumns()) { @@ -95,6 +101,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { int id = colType.getId().intValue(); Type icebergType = schema.findType(id); stats.put(id, col.getStatistics()); + geoStats.put(id, col.getGeospatialStatistics()); valueCounts.put(id, col.getValueCount()); conversions.put(id, ParquetConversions.converterFromParquet(colType, icebergType)); } @@ -549,6 +556,57 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + public Boolean stIntersects(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Long valueCount = valueCounts.get(id); + if (valueCount == null) { + // the column is not present and is all nulls + return ROWS_CANNOT_MATCH; + } + + Statistics colStats = stats.get(id); + if (colStats != null && 
!colStats.isEmpty()) { + if (allNulls(colStats, valueCount)) { + return ROWS_CANNOT_MATCH; + } + } + + GeospatialStatistics colGeoStats = geoStats.get(id); + if (colGeoStats == null) { + // No geospatial statistics, we cannot make any assumptions about the geospatial data. + return ROWS_MIGHT_MATCH; + } + + org.apache.parquet.column.statistics.geospatial.BoundingBox boundingBox = + colGeoStats.getBoundingBox(); + if (boundingBox == null || !boundingBox.isValid() || boundingBox.isXYEmpty()) { + // No valid geospatial bounds, we cannot make any assumptions about the geospatial data. + return ROWS_MIGHT_MATCH; + } + + // Found valid geospatial bounds from statistics, evaluate the spatial predicate to see if we + // can + // skip this row group. + BoundingBox dataBoundingBox = + new BoundingBox( + GeospatialBound.createXY(boundingBox.getXMin(), boundingBox.getYMin()), + GeospatialBound.createXY(boundingBox.getXMax(), boundingBox.getYMax())); + GeospatialPredicateEvaluators.GeospatialPredicateEvaluator evaluator = + GeospatialPredicateEvaluators.create(ref.type()); + if (!evaluator.intersects(dataBoundingBox, lit.value())) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean stDisjoint(BoundReference ref, Literal lit) { + return ROWS_MIGHT_MATCH; + } + @SuppressWarnings("unchecked") private T min(Statistics statistics, int id) { return (T) conversions.get(id).apply(statistics.genericGetMin()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java index d648cbf0694b..217473410d39 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java @@ -36,6 +36,8 @@ import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types.DecimalType; import org.apache.iceberg.types.Types.FixedType; +import 
org.apache.iceberg.types.Types.GeographyType; +import org.apache.iceberg.types.Types.GeometryType; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.MapType; import org.apache.iceberg.types.Types.NestedField; @@ -43,6 +45,7 @@ import org.apache.iceberg.types.Types.TimestampNanoType; import org.apache.iceberg.types.Types.TimestampType; import org.apache.iceberg.variants.Variant; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; @@ -238,6 +241,39 @@ public Type primitive( return Types.primitive(BINARY, repetition).as(STRING).id(id).named(name); case BINARY: return Types.primitive(BINARY, repetition).id(id).named(name); + case GEOMETRY: + GeometryType geometryType = ((GeometryType) primitive); + return Types.primitive(BINARY, repetition) + .as(LogicalTypeAnnotation.geometryType(geometryType.crs())) + .id(id) + .named(name); + case GEOGRAPHY: + GeographyType geographyType = ((GeographyType) primitive); + EdgeInterpolationAlgorithm algorithm; + switch (geographyType.algorithm()) { + case SPHERICAL: + algorithm = EdgeInterpolationAlgorithm.SPHERICAL; + break; + case VINCENTY: + algorithm = EdgeInterpolationAlgorithm.VINCENTY; + break; + case THOMAS: + algorithm = EdgeInterpolationAlgorithm.THOMAS; + break; + case ANDOYER: + algorithm = EdgeInterpolationAlgorithm.ANDOYER; + break; + case KARNEY: + algorithm = EdgeInterpolationAlgorithm.KARNEY; + break; + default: + throw new UnsupportedOperationException( + "Unsupported edge interpolation algorithm: " + geographyType.algorithm()); + } + return Types.primitive(BINARY, repetition) + .as(LogicalTypeAnnotation.geographyType(geographyType.crs(), algorithm)) + .id(id) + .named(name); case FIXED: FixedType fixed = (FixedType) primitive; diff --git 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestGeospatial.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestGeospatial.java new file mode 100644 index 000000000000..186bcb017418 --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestGeospatial.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.Files; +import org.apache.iceberg.InternalTestHelpers; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.InternalReader; +import org.apache.iceberg.data.parquet.InternalWriter; +import org.apache.iceberg.geospatial.GeospatialBound; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.EdgeAlgorithm; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBWriter; + +public class TestGeospatial { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "geom", Types.GeometryType.crs84()), + Types.NestedField.optional(3, "geog", Types.GeographyType.crs84())); + + private static final Schema SCHEMA_NON_CRS84 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "geom_3857", Types.GeometryType.of("srid:3857")), + Types.NestedField.optional( + 3, "geog_4269_karney", Types.GeographyType.of("srid:4269", EdgeAlgorithm.KARNEY))); + + private static final GenericRecord RECORD = 
GenericRecord.create(SCHEMA); + private static final GenericRecord RECORD_NON_CRS_84 = GenericRecord.create(SCHEMA_NON_CRS84); + private static final GeometryFactory GEOMETRY_FACTORY = new GeometryFactory(); + private static final WKBWriter WKB_WRITER = new WKBWriter(); + + private static ByteBuffer createPointWKB(double coordX, double coordY) { + return ByteBuffer.wrap( + WKB_WRITER.write(GEOMETRY_FACTORY.createPoint(new Coordinate(coordX, coordY)))); + } + + private static ByteBuffer createEmptyLineStringWKB() { + return ByteBuffer.wrap(WKB_WRITER.write(GEOMETRY_FACTORY.createLineString())); + } + + // WKB for POINT (1 1) + private static final ByteBuffer POINT_1_1_WKB = createPointWKB(1, 1); + // WKB for POINT (2 3) + private static final ByteBuffer POINT_2_3_WKB = createPointWKB(2, 3); + // WKB for LINESTRING EMPTY + private static final ByteBuffer LINESTRING_EMPTY_WKB = createEmptyLineStringWKB(); + + @TempDir private Path temp; + + @Test + public void testGeospatialTypes() throws IOException { + Record record = + RECORD.copy( + "id", 1, + "geom", POINT_1_1_WKB.slice(), // use slice to ensure independent buffer positions + "geog", POINT_2_3_WKB.slice()); + + Record actual = writeAndRead(SCHEMA, record); + InternalTestHelpers.assertEquals(SCHEMA.asStruct(), record, actual); + } + + @Test + public void testNullGeospatialTypes() throws IOException { + Record record = + RECORD.copy( + "id", 2, + "geom", null, + "geog", null); + + Record actual = writeAndRead(SCHEMA, record); + InternalTestHelpers.assertEquals(SCHEMA.asStruct(), record, actual); + } + + @Test + public void testEmptyGeospatialTypes() throws IOException { + Record record = + RECORD.copy( + "id", 3, + "geom", LINESTRING_EMPTY_WKB.slice(), + "geog", LINESTRING_EMPTY_WKB.slice()); + + Record actual = writeAndRead(SCHEMA, record); + InternalTestHelpers.assertEquals(SCHEMA.asStruct(), record, actual); + } + + @Test + public void testNonCrs84GeospatialTypes() throws IOException { + Record record = + 
RECORD_NON_CRS_84.copy( + "id", 4, + "geom_3857", POINT_1_1_WKB.slice(), + "geog_4269_karney", POINT_2_3_WKB.slice()); + + Record actual = writeAndRead(SCHEMA_NON_CRS84, record); + InternalTestHelpers.assertEquals(SCHEMA_NON_CRS84.asStruct(), record, actual); + } + + @Test + public void testGeometryMetrics() throws IOException { + Record record0 = + RECORD.copy( + "id", 1, + "geom", POINT_1_1_WKB.slice(), // use slice to ensure independent buffer positions + "geog", POINT_1_1_WKB.slice()); + Record record1 = + RECORD.copy( + "id", 1, + "geom", POINT_2_3_WKB.slice(), // use slice to ensure independent buffer positions + "geog", POINT_2_3_WKB.slice()); + Metrics metrics = writeAndRetrieveMetrics(SCHEMA, ImmutableList.of(record0, record1)); + ByteBuffer lowerBound = metrics.lowerBounds().get(2); + ByteBuffer upperBound = metrics.upperBounds().get(2); + GeospatialBound geoLowerBound = GeospatialBound.fromByteBuffer(lowerBound); + GeospatialBound geoUpperBound = GeospatialBound.fromByteBuffer(upperBound); + assertThat(geoLowerBound.x()).isEqualTo(1); + assertThat(geoLowerBound.y()).isEqualTo(1); + assertThat(geoUpperBound.x()).isEqualTo(2); + assertThat(geoUpperBound.y()).isEqualTo(3); + } + + @Test + public void testEmptyGeometryMetrics() throws IOException { + Record record = + RECORD.copy( + "id", 3, + "geom", LINESTRING_EMPTY_WKB.slice(), + "geog", LINESTRING_EMPTY_WKB.slice()); + Metrics metrics = writeAndRetrieveMetrics(SCHEMA, ImmutableList.of(record)); + ByteBuffer lowerBound = metrics.lowerBounds().get(2); + ByteBuffer upperBound = metrics.upperBounds().get(2); + assertThat(lowerBound).isNull(); + assertThat(upperBound).isNull(); + } + + @Test + public void testNullGeometryMetrics() throws IOException { + Record record = + RECORD.copy( + "id", 1, + "geom", null, + "geog", null); + Metrics metrics = writeAndRetrieveMetrics(SCHEMA, ImmutableList.of(record)); + ByteBuffer lowerBound = metrics.lowerBounds().get(2); + ByteBuffer upperBound = 
metrics.upperBounds().get(2); + assertThat(lowerBound).isNull(); + assertThat(upperBound).isNull(); + } + + private Record writeAndRead(Schema schema, Record record) throws IOException { + return Iterables.getOnlyElement(writeAndRead(schema, ImmutableList.of(record))); + } + + private List writeAndRead(Schema schema, List records) throws IOException { + OutputFile outputFile = Files.localOutput(temp.resolve("geospatial-test.parquet").toFile()); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(schema) + .createWriterFunc(fileSchema -> InternalWriter.create(schema.asStruct(), fileSchema)) + .build()) { + for (Record r : records) { + writer.add(r); + } + } + + InputFile inputFile = outputFile.toInputFile(); + try (CloseableIterable reader = + Parquet.read(inputFile) + .project(schema) + .createReaderFunc(fileSchema -> InternalReader.create(schema, fileSchema)) + .build()) { + return Lists.newArrayList(reader); + } + } + + private Metrics writeAndRetrieveMetrics(Schema schema, List records) throws IOException { + OutputFile outputFile = Files.localOutput(temp.resolve("geospatial-test.parquet").toFile()); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(schema) + .createWriterFunc(fileSchema -> InternalWriter.create(schema.asStruct(), fileSchema)) + .build()) { + for (Record r : records) { + writer.add(r); + } + writer.close(); + return writer.metrics(); + } + } +} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestInternalParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestInternalParquet.java index a53ac8972528..f193cab5c552 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestInternalParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestInternalParquet.java @@ -56,6 +56,11 @@ protected boolean supportsVariant() { return true; } + @Override + protected boolean supportsGeospatial() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws 
IOException { List expected = RandomInternalData.generate(schema, 100, 1376L);