diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index ffc3814c932c9..4f18ff3268443 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -77,6 +77,8 @@ license: |
 
   - In Spark 3.2, `CREATE TABLE .. LIKE ..` command can not use reserved properties. You need their specific clauses to specify them, for example, `CREATE TABLE test1 LIKE test LOCATION 'some path'`. You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `TBLPROPERTIES('owner'='yao')` will have no effect. In Spark version 3.1 and below, the reserved properties can be used in `CREATE TABLE .. LIKE ..` command but have no side effects, for example, `TBLPROPERTIES('location'='/tmp')` does not change the location of the table but only create a headless property just like `'a'='b'`.
 
+  - In Spark 3.2, the `TRANSFORM` operator no longer supports aliases in its inputs. In Spark version 3.1 and earlier, you could write a script transform such as `SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM TBL`.
+
 ## Upgrading from Spark SQL 3.0 to 3.1
 
   - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`.
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 78c224ce18013..c958f9c387767 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -524,9 +524,9 @@ querySpecification
     ;
 
 transformClause
-    : (SELECT kind=TRANSFORM '(' setQuantifier? namedExpressionSeq ')'
-    | kind=MAP setQuantifier? namedExpressionSeq
-    | kind=REDUCE setQuantifier? namedExpressionSeq)
+    : (SELECT kind=TRANSFORM '(' setQuantifier? expressionSeq ')'
+    | kind=MAP setQuantifier? expressionSeq
+    | kind=REDUCE setQuantifier? expressionSeq)
     inRowFormat=rowFormat?
     (RECORDWRITER recordWriter=STRING)?
     USING script=STRING
@@ -774,6 +774,10 @@ expression
     : booleanExpression
     ;
 
+expressionSeq
+    : expression (',' expression)*
+    ;
+
 booleanExpression
     : NOT booleanExpression                                        #logicalNot
     | EXISTS '(' query ')'                                         #exists
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index c7af21cabfe8f..f9317e865f275 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -627,6 +627,12 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
       .map(typedVisit[Expression])
   }
 
+  override def visitExpressionSeq(ctx: ExpressionSeqContext): Seq[Expression] = {
+    Option(ctx).toSeq
+      .flatMap(_.expression.asScala)
+      .map(typedVisit[Expression])
+  }
+
   /**
    * Create a logical plan using a having clause.
   */
@@ -680,8 +686,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
 
     val plan = visitCommonSelectQueryClausePlan(
       relation,
+      visitExpressionSeq(transformClause.expressionSeq),
       lateralView,
-      transformClause.namedExpressionSeq,
       whereClause,
       aggregationClause,
       havingClause,
@@ -726,8 +732,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
 
     val plan = visitCommonSelectQueryClausePlan(
       relation,
+      visitNamedExpressionSeq(selectClause.namedExpressionSeq),
       lateralView,
-      selectClause.namedExpressionSeq,
       whereClause,
       aggregationClause,
       havingClause,
@@ -740,8 +746,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
 
   def visitCommonSelectQueryClausePlan(
       relation: LogicalPlan,
+      expressions: Seq[Expression],
       lateralView: java.util.List[LateralViewContext],
-      namedExpressionSeq: NamedExpressionSeqContext,
       whereClause: WhereClauseContext,
       aggregationClause: AggregationClauseContext,
       havingClause: HavingClauseContext,
@@ -753,8 +759,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
     // Add where.
     val withFilter = withLateralView.optionalMap(whereClause)(withWhereClause)
 
-    val expressions = visitNamedExpressionSeq(namedExpressionSeq)
-
     // Add aggregation or a project.
     val namedExpressions = expressions.map {
       case e: NamedExpression => e
diff --git a/sql/core/src/test/resources/sql-tests/inputs/transform.sql b/sql/core/src/test/resources/sql-tests/inputs/transform.sql
index 8ee0ddc0a0ba1..e3ae34ee89300 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/transform.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/transform.sql
@@ -206,7 +206,7 @@ FROM script_trans
 LIMIT 1;
 
 SELECT TRANSFORM(
-  b AS d5, a,
+  b, a,
   CASE
     WHEN c > 100 THEN 1
     WHEN c < 100 THEN 2
@@ -225,45 +225,45 @@ SELECT TRANSFORM(*)
 FROM script_trans
 WHERE a <= 4;
 
-SELECT TRANSFORM(b AS d, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b;
 
-SELECT TRANSFORM(b AS d, MAX(a) FILTER (WHERE a > 3) AS max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a) FILTER (WHERE a > 3), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a,b,c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b;
 
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(sum(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(sum(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 2
 GROUP BY b;
 
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b
-HAVING max_a > 0;
+HAVING MAX(a) > 0;
 
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b
-HAVING max(a) > 1;
+HAVING MAX(a) > 1;
 
-SELECT TRANSFORM(b, MAX(a) OVER w as max_a, CAST(SUM(c) OVER w AS STRING))
+SELECT TRANSFORM(b, MAX(a) OVER w, CAST(SUM(c) OVER w AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 WINDOW w AS (PARTITION BY b ORDER BY a);
 
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING), myCol, myCol2)
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING), myCol, myCol2)
   USING 'cat' AS (a, b, c, d, e)
 FROM script_trans
 LATERAL VIEW explode(array(array(1,2,3))) myTable AS myCol
@@ -280,7 +280,7 @@ FROM(
 SELECT a + 1;
 
 FROM(
-    SELECT TRANSFORM(a, SUM(b) b)
+    SELECT TRANSFORM(a, SUM(b))
     USING 'cat' AS (`a` INT, b STRING)
     FROM script_trans
     GROUP BY a
@@ -308,14 +308,6 @@ HAVING true;
 
 SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false;
 
-SET spark.sql.parser.quotedRegexColumnNames=true;
-
-SELECT TRANSFORM(`(a|b)?+.+`)
-  USING 'cat' AS (c)
-FROM script_trans;
-
-SET spark.sql.parser.quotedRegexColumnNames=false;
-
 -- SPARK-34634: self join using CTE contains transform
 WITH temp AS (
   SELECT TRANSFORM(a) USING 'cat' AS (b string) FROM t
@@ -331,3 +323,22 @@ SELECT TRANSFORM(ALL b, a, c)
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4;
+
+-- SPARK-35070: TRANSFORM not support alias in inputs
+SELECT TRANSFORM(b AS b_1, MAX(a), CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b;
+
+SELECT TRANSFORM(b b_1, MAX(a), CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b;
+
+SELECT TRANSFORM(b, MAX(a) AS max_a, CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b;
diff --git a/sql/core/src/test/resources/sql-tests/results/transform.sql.out b/sql/core/src/test/resources/sql-tests/results/transform.sql.out
index fd879865fd5eb..c20ec4ca20715 100644
--- a/sql/core/src/test/resources/sql-tests/results/transform.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/transform.sql.out
@@ -376,7 +376,7 @@ struct
 
 -- !query
 SELECT TRANSFORM(
-  b AS d5, a,
+  b, a,
   CASE
     WHEN c > 100 THEN 1
     WHEN c < 100 THEN 2
@@ -416,7 +416,7 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b AS d, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
@@ -429,7 +429,7 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b AS d, MAX(a) FILTER (WHERE a > 3) AS max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a) FILTER (WHERE a > 3), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a,b,c)
 FROM script_trans
 WHERE a <= 4
@@ -442,7 +442,7 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(sum(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(sum(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 2
@@ -454,12 +454,12 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b
-HAVING max_a > 0
+HAVING MAX(a) > 0
 -- !query schema
 struct<a:string,b:string,c:string>
 -- !query output
@@ -468,12 +468,12 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING))
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
 GROUP BY b
-HAVING max(a) > 1
+HAVING MAX(a) > 1
 -- !query schema
 struct<a:string,b:string,c:string>
 -- !query output
@@ -481,7 +481,7 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b, MAX(a) OVER w as max_a, CAST(SUM(c) OVER w AS STRING))
+SELECT TRANSFORM(b, MAX(a) OVER w, CAST(SUM(c) OVER w AS STRING))
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
@@ -494,7 +494,7 @@ struct
 
 
 -- !query
-SELECT TRANSFORM(b, MAX(a) as max_a, CAST(SUM(c) AS STRING), myCol, myCol2)
+SELECT TRANSFORM(b, MAX(a), CAST(SUM(c) AS STRING), myCol, myCol2)
   USING 'cat' AS (a, b, c, d, e)
 FROM script_trans
 LATERAL VIEW explode(array(array(1,2,3))) myTable AS myCol
@@ -527,7 +527,7 @@ struct<(a + 1):int>
 
 -- !query
 FROM(
-    SELECT TRANSFORM(a, SUM(b) b)
+    SELECT TRANSFORM(a, SUM(b))
     USING 'cat' AS (`a` INT, b STRING)
     FROM script_trans
     GROUP BY a
@@ -600,34 +600,6 @@ struct
 spark.sql.legacy.parser.havingWithoutGroupByAsWhere	false
 
 
--- !query
-SET spark.sql.parser.quotedRegexColumnNames=true
--- !query schema
-struct<key:string,value:string>
--- !query output
-spark.sql.parser.quotedRegexColumnNames	true
-
-
--- !query
-SELECT TRANSFORM(`(a|b)?+.+`)
-  USING 'cat' AS (c)
-FROM script_trans
--- !query schema
-struct<c:string>
--- !query output
-3
-6
-9
-
-
--- !query
-SET spark.sql.parser.quotedRegexColumnNames=false
--- !query schema
-struct<key:string,value:string>
--- !query output
-spark.sql.parser.quotedRegexColumnNames	false
-
-
 -- !query
 WITH temp AS (
   SELECT TRANSFORM(a) USING 'cat' AS (b string) FROM t
@@ -679,3 +651,69 @@ SELECT TRANSFORM(ALL b, a, c)
   USING 'cat' AS (a, b, c)
 FROM script_trans
 WHERE a <= 4
+
+
+-- !query
+SELECT TRANSFORM(b AS b_1, MAX(a), CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+no viable alternative at input 'SELECT TRANSFORM(b AS'(line 1, pos 19)
+
+== SQL ==
+SELECT TRANSFORM(b AS b_1, MAX(a), CAST(sum(c) AS STRING))
+-------------------^^^
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
+
+
+-- !query
+SELECT TRANSFORM(b b_1, MAX(a), CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+no viable alternative at input 'SELECT TRANSFORM(b b_1'(line 1, pos 19)
+
+== SQL ==
+SELECT TRANSFORM(b b_1, MAX(a), CAST(sum(c) AS STRING))
+-------------------^^^
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
+
+
+-- !query
+SELECT TRANSFORM(b, MAX(a) AS max_a, CAST(sum(c) AS STRING))
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+no viable alternative at input 'SELECT TRANSFORM(b, MAX(a) AS'(line 1, pos 27)
+
+== SQL ==
+SELECT TRANSFORM(b, MAX(a) AS max_a, CAST(sum(c) AS STRING))
+---------------------------^^^
+  USING 'cat' AS (a, b, c)
+FROM script_trans
+WHERE a <= 2
+GROUP BY b
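
A minimal sketch of the behavior change exercised by this patch, assuming the Spark 3.2 catalyst module is on the classpath; the `TransformAliasExample` object and the table name `tbl` are illustrative only and not part of the change. With `expressionSeq` replacing `namedExpressionSeq` in `transformClause`, an alias inside `TRANSFORM(...)` is rejected at parse time, while un-aliased inputs still parse and output columns are named through the `AS` clause after `USING`:

```scala
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}

// Hypothetical driver, not part of the patch: demonstrates the Spark 3.2 parser behavior.
object TransformAliasExample {
  def main(args: Array[String]): Unit = {
    // Un-aliased inputs go through the new expressionSeq rule; naming happens in the
    // AS clause after USING, so this still parses.
    val plan = CatalystSqlParser.parsePlan(
      "SELECT TRANSFORM(a, b) USING 'cat' AS (c1, c2) FROM tbl")
    println(plan.treeString)

    // Aliased inputs are no longer accepted by transformClause and fail while parsing,
    // matching the "no viable alternative at input" errors in transform.sql.out above.
    try {
      CatalystSqlParser.parsePlan(
        "SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM tbl")
    } catch {
      case e: ParseException => println(s"rejected as expected: ${e.getMessage}")
    }
  }
}
```

Queries that relied on input aliases can usually be rewritten by naming the script output in the `AS (...)` clause after `USING`, as the tests added to transform.sql do.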