
Commit 05ce50a

Require elem type
1 parent dd69aa6 commit 05ce50a

11 files changed, +41 -33 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala

Lines changed: 5 additions & 6 deletions
@@ -429,7 +429,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
 case class InSet(
     child: Expression,
     hset: Set[Any],
-    hsetElemType: Option[DataType] = None) extends UnaryExpression with Predicate {
+    hsetElemType: DataType) extends UnaryExpression with Predicate {
 
   require(hset != null, "hset could not be null")
 
@@ -449,12 +449,12 @@ case class InSet(
     }
   }
 
-  @transient lazy val set: Set[Any] = child.dataType match {
+  @transient lazy val set: Set[Any] = hsetElemType match {
     case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
     case _: NullType => hset
     case _ =>
       // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
-      TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ (hset - null)
+      TreeSet.empty(TypeUtils.getInterpretedOrdering(hsetElemType)) ++ (hset - null)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
@@ -465,7 +465,7 @@ case class InSet(
     }
   }
 
-  private def canBeComputedUsingSwitch: Boolean = child.dataType match {
+  private def canBeComputedUsingSwitch: Boolean = hsetElemType match {
     case ByteType | ShortType | IntegerType | DateType => true
     case _ => false
   }
@@ -523,9 +523,8 @@ case class InSet(
 
   override def sql: String = {
     val valueSQL = child.sql
-    val elemType = hsetElemType.getOrElse(child.dataType)
     val listSQL = hset.toSeq
-      .map(elem => Literal(convertToScala(elem, elemType)).sql)
+      .map(elem => Literal(convertToScala(elem, hsetElemType)).sql)
       .mkString(", ")
     s"($valueSQL IN ($listSQL))"
  }
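
Net effect of this file's change: the element type is no longer recovered from child.dataType but must be supplied at construction, and set membership, switch-based codegen, and sql() all read it from hsetElemType. A minimal sketch of building the expression under the new signature (the values and the IntegerType choice are illustrative, mirroring the updated tests):

    import scala.collection.immutable.HashSet
    import org.apache.spark.sql.catalyst.expressions.{InSet, Literal}
    import org.apache.spark.sql.types.IntegerType

    // hsetElemType is now a required third argument; before this commit it
    // was Option[DataType] = None and the expression fell back to child.dataType.
    val pred = InSet(Literal(1), HashSet[Any](1, 2, 3), IntegerType)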

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ object OptimizeIn extends Rule[LogicalPlan] {
       EqualTo(v, newList.head)
     } else if (newList.length > SQLConf.get.optimizerInSetConversionThreshold) {
       val hSet = newList.map(e => e.eval(EmptyRow))
-      InSet(v, HashSet() ++ hSet)
+      InSet(v, HashSet() ++ hSet, v.dataType)
     } else if (newList.length < list.length) {
       expr.copy(list = newList)
     } else { // newList.length == list.length && newList.length > 1
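
For reference, a hedged sketch of the rewrite this rule now performs once an IN list outgrows spark.sql.optimizer.inSetConversionThreshold (default 10); the attribute name follows OptimizeInSuite below, and the rule itself takes the element type from v.dataType:

    import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
    import org.apache.spark.sql.catalyst.expressions.{In, InSet, Literal}
    import org.apache.spark.sql.types.IntegerType

    val a = UnresolvedAttribute("a")
    // An 11-element IN list exceeds the default threshold of 10 ...
    val before = In(a, (1 to 11).map(Literal(_)))
    // ... so OptimizeIn produces an InSet that now carries the element type.
    val after = InSet(a, (1 to 11).toSet, IntegerType)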

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala

Lines changed: 12 additions & 10 deletions
@@ -130,7 +130,9 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
   private def checkInAndInSet(in: In, expected: Any): Unit = {
     // expecting all in.list are Literal or NonFoldableLiteral.
     checkEvaluation(in, expected)
-    checkEvaluation(InSet(in.value, HashSet() ++ in.list.map(_.eval())), expected)
+    checkEvaluation(
+      InSet(in.value, HashSet() ++ in.list.map(_.eval()), in.value.dataType),
+      expected)
   }
 
   test("basic IN/INSET predicate test") {
@@ -154,7 +156,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
         Literal(2)))),
       true)
     checkEvaluation(
-      And(InSet(Literal(1), HashSet(1, 2)), InSet(Literal(2), Set(1, 2))),
+      And(InSet(Literal(1), HashSet(1, 2), IntegerType), InSet(Literal(2), Set(1, 2), IntegerType)),
       true)
 
     val ns = NonFoldableLiteral.create(null, StringType)
@@ -256,12 +258,12 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 
     val nullLiteral = Literal(null, presentValue.dataType)
 
-    checkEvaluation(InSet(nullLiteral, values), expected = null)
-    checkEvaluation(InSet(nullLiteral, values + null), expected = null)
-    checkEvaluation(InSet(presentValue, values), expected = true)
-    checkEvaluation(InSet(presentValue, values + null), expected = true)
-    checkEvaluation(InSet(absentValue, values), expected = false)
-    checkEvaluation(InSet(absentValue, values + null), expected = null)
+    checkEvaluation(InSet(nullLiteral, values, nullLiteral.dataType), expected = null)
+    checkEvaluation(InSet(nullLiteral, values + null, nullLiteral.dataType), expected = null)
+    checkEvaluation(InSet(presentValue, values, presentValue.dataType), expected = true)
+    checkEvaluation(InSet(presentValue, values + null, presentValue.dataType), expected = true)
+    checkEvaluation(InSet(absentValue, values, absentValue.dataType), expected = false)
+    checkEvaluation(InSet(absentValue, values + null, absentValue.dataType), expected = null)
   }
 
   def checkAllTypes(): Unit = {
@@ -498,7 +500,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("SPARK-22693: InSet should not use global variables") {
     val ctx = new CodegenContext
-    InSet(Literal(1), Set(1, 2, 3, 4)).genCode(ctx)
+    InSet(Literal(1), Set(1, 2, 3, 4), IntegerType).genCode(ctx)
     assert(ctx.inlinedMutableStates.isEmpty)
   }
 
@@ -535,7 +537,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("SPARK-29100: InSet with empty input set") {
     val row = create_row(1)
-    val inSet = InSet(BoundReference(0, IntegerType, true), Set.empty)
+    val inSet = InSet(BoundReference(0, IntegerType, true), Set.empty, IntegerType)
     checkEvaluation(inSet, false, row)
   }
 }

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class OptimizeInSuite extends PlanTest {
     val optimized = Optimize.execute(originalQuery.analyze)
     val correctAnswer =
       testRelation
-        .where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet))
+        .where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet, IntegerType))
        .analyze
 
    comparePlans(optimized, correctAnswer)

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala

Lines changed: 7 additions & 6 deletions
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.{ColumnStatsM
 import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * In this test suite, we test predicates containing the following operators:
@@ -352,15 +353,15 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
 
   test("cint IN (3, 4, 5)") {
     validateEstimatedStats(
-      Filter(InSet(attrInt, Set(3, 4, 5)), childStatsTestPlan(Seq(attrInt), 10L)),
+      Filter(InSet(attrInt, Set(3, 4, 5), IntegerType), childStatsTestPlan(Seq(attrInt), 10L)),
       Seq(attrInt -> ColumnStat(distinctCount = Some(3), min = Some(3), max = Some(5),
         nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))),
       expectedRowCount = 3)
   }
 
   test("evaluateInSet with all zeros") {
     validateEstimatedStats(
-      Filter(InSet(attrString, Set(3, 4, 5)),
+      Filter(InSet(attrString, Set(3, 4, 5), IntegerType),
         StatsTestPlan(Seq(attrString), 0,
           AttributeMap(Seq(attrString ->
             ColumnStat(distinctCount = Some(0), min = None, max = None,
@@ -371,7 +372,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
 
   test("evaluateInSet with string") {
     validateEstimatedStats(
-      Filter(InSet(attrString, Set("A0")),
+      Filter(InSet(attrString, Set(UTF8String.fromString("A0")), StringType),
         StatsTestPlan(Seq(attrString), 10,
           AttributeMap(Seq(attrString ->
             ColumnStat(distinctCount = Some(10), min = None, max = None,
@@ -383,14 +384,14 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
 
   test("cint NOT IN (3, 4, 5)") {
     validateEstimatedStats(
-      Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
+      Filter(Not(InSet(attrInt, Set(3, 4, 5), IntegerType)), childStatsTestPlan(Seq(attrInt), 10L)),
       Seq(attrInt -> colStatInt.copy(distinctCount = Some(7))),
       expectedRowCount = 7)
   }
 
   test("cbool IN (true)") {
     validateEstimatedStats(
-      Filter(InSet(attrBool, Set(true)), childStatsTestPlan(Seq(attrBool), 10L)),
+      Filter(InSet(attrBool, Set(true), BooleanType), childStatsTestPlan(Seq(attrBool), 10L)),
       Seq(attrBool -> ColumnStat(distinctCount = Some(1), min = Some(true), max = Some(true),
         nullCount = Some(0), avgLen = Some(1), maxLen = Some(1))),
       expectedRowCount = 5)
@@ -510,7 +511,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
       attributeStats = AttributeMap(Seq(attrInt -> cornerChildColStatInt))
     )
     validateEstimatedStats(
-      Filter(InSet(attrInt, Set(1, 2, 3, 4, 5)), cornerChildStatsTestplan),
+      Filter(InSet(attrInt, Set(1, 2, 3, 4, 5), IntegerType), cornerChildStatsTestplan),
       Seq(attrInt -> ColumnStat(distinctCount = Some(2), min = Some(1), max = Some(5),
         nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))),
      expectedRowCount = 2)

sql/core/src/main/scala/org/apache/spark/sql/Column.scala

Lines changed: 2 additions & 1 deletion
@@ -830,7 +830,8 @@ class Column(val expr: Expression) extends Logging {
   def isInCollection(values: scala.collection.Iterable[_]): Column = withExpr {
     val exprValues = values.toSeq.map(lit(_).expr)
     if (exprValues.size > SQLConf.get.optimizerInSetConversionThreshold) {
-      InSet(expr, exprValues.map(_.eval()).toSet, exprValues.headOption.map(_.dataType))
+      val elemType = exprValues.headOption.map(_.dataType).getOrElse(NullType)
+      InSet(expr, exprValues.map(_.eval()).toSet, elemType)
     } else {
       In(expr, exprValues)
    }
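
Here isInCollection derives the element type from the first literal; the NullType fallback is defensive, since an empty collection can never exceed the conversion threshold and therefore stays on the In branch. A usage sketch (the session setup, column name, and data are assumptions for illustration):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("inset-sketch").master("local[*]").getOrCreate()
    val df = spark.range(100).selectExpr("cast(id as int) as id")

    // More than spark.sql.optimizer.inSetConversionThreshold values
    // (default 10), so the InSet branch is taken and the predicate is built
    // with an explicit IntegerType element type.
    val wide = df.filter(df("id").isInCollection(1 to 20))
    wide.explain()  // the physical plan should surface an INSET predicate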

sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala

Lines changed: 1 addition & 1 deletion
@@ -159,7 +159,7 @@ case class InSubqueryExec(
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     prepareResult()
-    InSet(child, result.toSet).doGenCode(ctx, ev)
+    InSet(child, result.toSet, child.dataType).doGenCode(ctx, ev)
   }
 
   override lazy val canonicalized: InSubqueryExec = {

sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -872,7 +872,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
   }
 
   test("SPARK-31563: sql of InSet for UTF8String collection") {
-    val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString))
+    val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString), StringType)
     assert(inSet.sql === "('a' IN ('a', 'b'))")
   }
 

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala

Lines changed: 3 additions & 1 deletion
@@ -110,7 +110,9 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession {
   testTranslateFilter(LessThanOrEqual(1, attrInt),
     Some(sources.GreaterThanOrEqual(intColName, 1)))
 
-  testTranslateFilter(InSet(attrInt, Set(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3))))
+  testTranslateFilter(
+    InSet(attrInt, Set(1, 2, 3), IntegerType),
+    Some(sources.In(intColName, Array(1, 2, 3))))
 
   testTranslateFilter(In(attrInt, Seq(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3))))
 
sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala

Lines changed: 5 additions & 2 deletions
@@ -37,6 +37,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}
+import org.apache.spark.sql.types.IntegerType
 import org.apache.spark.util.Utils
 import org.apache.spark.util.collection.BitSet
 
@@ -188,8 +189,10 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
       df)
 
     // Case 4: InSet
-    val inSetExpr = expressions.InSet($"j".expr,
-      Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr))
+    val inSetExpr = expressions.InSet(
+      $"j".expr,
+      Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr),
+      IntegerType)
     checkPrunedAnswers(
       bucketSpec,
      bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3),
