Commit ec2a2b8: address comments
1 parent 009d760

5 files changed (+14 additions, -11 deletions)

sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala

Lines changed: 3 additions & 4 deletions
@@ -66,9 +66,8 @@ case class ColumnarToRowExec(child: SparkPlan) extends UnaryExecNode with Codege
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
-  // `ColumnarToRowExec` is the beginning of a codegen stage, so it doesn't need to copy result and
-  // it can add limit condition check.
-  override def needCopyResult: Boolean = false
+  // `ColumnarToRowExec` processes the input RDD directly, which is kind of a leaf node in the
+  // codegen stage and needs to do the limit check.
   protected override def canCheckLimitNotReached: Boolean = true
 
   override lazy val metrics: Map[String, SQLMetric] = Map(

@@ -431,7 +430,7 @@ case class RowToColumnarExec(child: SparkPlan) extends UnaryExecNode {
     // Instead of creating a new config we are reusing columnBatchSize. In the future if we do
     // combine with some of the Arrow conversion tools we will need to unify some of the configs.
     val numRows = conf.columnBatchSize
-    // This avoids calling `output` in the RDD closure, so that we don't need to include the entire
+    // This avoids calling `schema` in the RDD closure, so that we don't need to include the entire
     // plan (this) in the closure.
     val localSchema = this.schema
     child.execute().mapPartitionsInternal { rowIterator =>
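
The `localSchema` line above relies on a general Spark closure-capture pattern: whatever an RDD closure references is serialized and shipped with each task, so reading `this.schema` inside `mapPartitionsInternal` would pull the whole operator (and its child plan) into the closure. A minimal sketch of the pattern using the public RDD API and hypothetical names (not code from this commit):

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType

// Hypothetical operator for illustration. The class is not Serializable, so a
// closure that captured `this` would fail with "Task not serializable".
class ExampleOp(val schema: StructType, input: RDD[String]) {
  def fieldCounts(): RDD[Int] = {
    // Copy the field into a local val; the task closure then captures only
    // the (serializable) StructType, not the whole ExampleOp instance.
    val localSchema = this.schema
    input.mapPartitions { rows => rows.map(_ => localSchema.length) }
  }
}

Writing `this.schema.length` inside the closure would still compile, but Spark would then try to serialize the enclosing `ExampleOp` when shipping the task.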

sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala

Lines changed: 6 additions & 3 deletions
@@ -517,6 +517,9 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with InputRDDCod
     child.executeColumnar()
   }
 
+  // `InputAdapter` can only generate code to process the rows from its child. If the child produces
+  // columnar batches, there must be a `ColumnarToRowExec` above `InputAdapter` to handle it by
+  // overriding `inputRDD`.
   override def inputRDD: RDD[InternalRow] = child.execute()
 
   // This is a leaf node so the node can produce limit not reached checks.

@@ -868,9 +871,6 @@ case class CollapseCodegenStages(
         // The children of SortMergeJoin should do codegen separately.
         j.withNewChildren(j.children.map(
           child => InputAdapter(insertWholeStageCodegen(child))))
-      // `ColumnarToRowExec` is kind of a leaf node to whole-stage-codegen. Its generated code can
-      // process data from the input RDD directly.
-      case c: ColumnarToRowExec => c
       case p => p.withNewChildren(p.children.map(insertInputAdapter))
     }
   }

@@ -889,6 +889,9 @@ case class CollapseCodegenStages(
         // to support the fast driver-local collect/take paths.
         plan
       case plan: CodegenSupport if supportCodegen(plan) =>
+        // The whole-stage-codegen framework is row-based. If a plan supports columnar execution,
+        // it can't support whole-stage-codegen at the same time.
+        assert(!plan.supportsColumnar)
         WholeStageCodegenExec(insertInputAdapter(plan))(codegenStageCounter.incrementAndGet())
       case other =>
         other.withNewChildren(other.children.map(insertWholeStageCodegen))
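
Two details of the `CollapseCodegenStages` changes above are visible in the test updates below. With the `ColumnarToRowExec` special case removed from `insertInputAdapter`, a columnar scan now enters a codegen stage as `WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(scan)))`, and the new assertion makes explicit that generated code is row-based. A hedged sketch of checking that plan shape from a spark-shell session, mirroring the updated `WholeStageCodegenSuite` test (public APIs only; the exact plan can vary with version and configuration):

import org.apache.spark.sql.execution.{ColumnarToRowExec, FilterExec, InputAdapter, WholeStageCodegenExec}
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec

val ds = spark.range(3).cache()
ds.count()  // materialize the cached, columnar in-memory relation

val plan = ds.filter(_ > 0).queryExecution.executedPlan
// The in-memory columnar scan sits under an InputAdapter, and ColumnarToRowExec
// converts its batches to rows for the surrounding codegen stage.
val matched = plan.collect {
  case WholeStageCodegenExec(FilterExec(_,
      ColumnarToRowExec(InputAdapter(_: InMemoryTableScanExec)))) => ()
}
assert(matched.length == 1)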

sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala

Lines changed: 2 additions & 2 deletions
@@ -1293,8 +1293,8 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
       checkAnswer(df, Seq(Row(0, 0), Row(2, 0)))
       // need to execute the query before we can examine fs.inputRDDs()
       assert(df.queryExecution.executedPlan match {
-        case WholeStageCodegenExec(ColumnarToRowExec(
-            fs @ FileSourceScanExec(_, _, _, partitionFilters, _, _, _))) =>
+        case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
+            fs @ FileSourceScanExec(_, _, _, partitionFilters, _, _, _)))) =>
           partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
             fs.inputRDDs().forall(
               _.asInstanceOf[FileScanRDD].filePartitions.forall(

sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala

Lines changed: 2 additions & 1 deletion
@@ -128,7 +128,8 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
     val dsIntFilter = dsInt.filter(_ > 0)
     val planInt = dsIntFilter.queryExecution.executedPlan
     assert(planInt.collect {
-      case WholeStageCodegenExec(FilterExec(_, ColumnarToRowExec(_: InMemoryTableScanExec))) => ()
+      case WholeStageCodegenExec(FilterExec(_,
+          ColumnarToRowExec(InputAdapter(_: InMemoryTableScanExec)))) => ()
     }.length == 1)
     assert(dsIntFilter.collect() === Array(1, 2))

sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala

Lines changed: 1 addition & 1 deletion
@@ -487,7 +487,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
 
     val planBeforeFilter = df2.queryExecution.executedPlan.collect {
       case FilterExec(_, c: ColumnarToRowExec) => c.child
-      case WholeStageCodegenExec(FilterExec(_, c: ColumnarToRowExec)) => c.child
+      case WholeStageCodegenExec(FilterExec(_, ColumnarToRowExec(i: InputAdapter))) => i.child
     }
     assert(planBeforeFilter.head.isInstanceOf[InMemoryTableScanExec])
