Skip to content

Commit 2741ae7

Browse files
authored
feat: Only allow incompatible cast expressions to run in comet if a config is enabled (#362)
1 parent 5fc6327 commit 2741ae7

File tree

13 files changed

+663
-181
lines changed

13 files changed

+663
-181
lines changed

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 8 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,9 @@
1919

2020
package org.apache.comet
2121

22-
import java.io.{BufferedOutputStream, FileOutputStream}
2322
import java.util.concurrent.TimeUnit
2423

2524
import scala.collection.mutable.ListBuffer
26-
import scala.io.Source
2725

2826
import org.apache.spark.network.util.ByteUnit
2927
import org.apache.spark.network.util.JavaUtils
@@ -376,12 +374,14 @@ object CometConf {
376374
.booleanConf
377375
.createWithDefault(false)
378376

379-
val COMET_CAST_STRING_TO_TIMESTAMP: ConfigEntry[Boolean] = conf(
380-
"spark.comet.cast.stringToTimestamp")
381-
.doc(
382-
"Comet is not currently fully compatible with Spark when casting from String to Timestamp.")
383-
.booleanConf
384-
.createWithDefault(false)
377+
val COMET_CAST_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
378+
conf("spark.comet.cast.allowIncompatible")
379+
.doc(
380+
"Comet is not currently fully compatible with Spark for all cast operations. " +
381+
"Set this config to true to allow them anyway. See compatibility guide " +
382+
"for more information.")
383+
.booleanConf
384+
.createWithDefault(false)
385385

386386
}
387387

@@ -625,36 +625,3 @@ private[comet] case class ConfigBuilder(key: String) {
625625
private object ConfigEntry {
626626
val UNDEFINED = "<undefined>"
627627
}
628-
629-
/**
630-
* Utility for generating markdown documentation from the configs.
631-
*
632-
* This is invoked when running `mvn clean package -DskipTests`.
633-
*/
634-
object CometConfGenerateDocs {
635-
def main(args: Array[String]): Unit = {
636-
if (args.length != 2) {
637-
// scalastyle:off println
638-
println("Missing arguments for template file and output file")
639-
// scalastyle:on println
640-
sys.exit(-1)
641-
}
642-
val templateFilename = args.head
643-
val outputFilename = args(1)
644-
val w = new BufferedOutputStream(new FileOutputStream(outputFilename))
645-
for (line <- Source.fromFile(templateFilename).getLines()) {
646-
if (line.trim == "<!--CONFIG_TABLE-->") {
647-
val publicConfigs = CometConf.allConfs.filter(_.isPublic)
648-
val confs = publicConfigs.sortBy(_.key)
649-
w.write("| Config | Description | Default Value |\n".getBytes)
650-
w.write("|--------|-------------|---------------|\n".getBytes)
651-
for (conf <- confs) {
652-
w.write(s"| ${conf.key} | ${conf.doc.trim} | ${conf.defaultValueString} |\n".getBytes)
653-
}
654-
} else {
655-
w.write(s"${line.trim}\n".getBytes)
656-
}
657-
}
658-
w.close()
659-
}
660-
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Compatibility Guide
21+
22+
Comet aims to provide consistent results with the version of Apache Spark that is being used.
23+
24+
This guide offers information about areas of functionality where there are known differences.
25+
26+
## ANSI mode
27+
28+
Comet currently ignores ANSI mode in most cases, and therefore can produce different results than Spark. By default,
29+
Comet will fall back to Spark if ANSI mode is enabled. To enable Comet to accelerate queries when ANSI mode is enabled,
30+
specify `spark.comet.ansi.enabled=true` in the Spark configuration. Comet's ANSI support is experimental and should not
31+
be used in production.
32+
33+
There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
34+
35+
## Cast
36+
37+
Cast operations in Comet fall into three levels of support:
38+
39+
- **Compatible**: The results match Apache Spark
40+
- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs
41+
will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting
42+
`spark.comet.cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not
43+
recommended for production use.
44+
- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to
45+
Spark.
46+
47+
The following table shows the current cast operations supported by Comet. Any cast that does not appear in this
48+
table (such as those involving complex types and timestamp_ntz, for example) is not supported by Comet.
49+
50+
<!--CAST_TABLE-->
Lines changed: 136 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
<!---
2-
Licensed to the Apache Software Foundation (ASF) under one
3-
or more contributor license agreements. See the NOTICE file
4-
distributed with this work for additional information
5-
regarding copyright ownership. The ASF licenses this file
6-
to you under the Apache License, Version 2.0 (the
7-
"License"); you may not use this file except in compliance
8-
with the License. You may obtain a copy of the License at
9-
10-
http://www.apache.org/licenses/LICENSE-2.0
11-
12-
Unless required by applicable law or agreed to in writing,
13-
software distributed under the License is distributed on an
14-
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15-
KIND, either express or implied. See the License for the
16-
specific language governing permissions and limitations
17-
under the License.
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
1818
-->
1919

2020
# Compatibility Guide
@@ -34,13 +34,126 @@ There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where
3434

3535
## Cast
3636

37-
Comet currently delegates to Apache DataFusion for most cast operations, and this means that the behavior is not
38-
guaranteed to be consistent with Spark.
37+
Cast operations in Comet fall into three levels of support:
3938

40-
There is an [epic](https://github.com/apache/datafusion-comet/issues/286) where we are tracking the work to implement Spark-compatible cast expressions.
39+
- **Compatible**: The results match Apache Spark
40+
- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs
41+
will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting
42+
`spark.comet.cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not
43+
recommended for production use.
44+
- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to
45+
Spark.
4146

42-
### Cast from String to Timestamp
47+
The following table shows the current cast operations supported by Comet. Any cast that does not appear in this
48+
table (such as those involving complex types and timestamp_ntz, for example) is not supported by Comet.
4349

44-
Casting from String to Timestamp is disabled by default due to incompatibilities with Spark, including timezone
45-
issues, and can be enabled by setting `spark.comet.cast.stringToTimestamp=true`. See the
46-
[tracking issue](https://github.com/apache/datafusion-comet/issues/328) for more information.
50+
| From Type | To Type | Compatible? | Notes |
51+
| --------- | --------- | ------------ | ----------------------------------- |
52+
| boolean | byte | Compatible | |
53+
| boolean | short | Compatible | |
54+
| boolean | integer | Compatible | |
55+
| boolean | long | Compatible | |
56+
| boolean | float | Compatible | |
57+
| boolean | double | Compatible | |
58+
| boolean | decimal | Unsupported | |
59+
| boolean | string | Compatible | |
60+
| boolean | timestamp | Unsupported | |
61+
| byte | boolean | Compatible | |
62+
| byte | short | Compatible | |
63+
| byte | integer | Compatible | |
64+
| byte | long | Compatible | |
65+
| byte | float | Compatible | |
66+
| byte | double | Compatible | |
67+
| byte | decimal | Compatible | |
68+
| byte | string | Compatible | |
69+
| byte | binary | Unsupported | |
70+
| byte | timestamp | Unsupported | |
71+
| short | boolean | Compatible | |
72+
| short | byte | Compatible | |
73+
| short | integer | Compatible | |
74+
| short | long | Compatible | |
75+
| short | float | Compatible | |
76+
| short | double | Compatible | |
77+
| short | decimal | Compatible | |
78+
| short | string | Compatible | |
79+
| short | binary | Unsupported | |
80+
| short | timestamp | Unsupported | |
81+
| integer | boolean | Compatible | |
82+
| integer | byte | Compatible | |
83+
| integer | short | Compatible | |
84+
| integer | long | Compatible | |
85+
| integer | float | Compatible | |
86+
| integer | double | Compatible | |
87+
| integer | decimal | Compatible | |
88+
| integer | string | Compatible | |
89+
| integer | binary | Unsupported | |
90+
| integer | timestamp | Unsupported | |
91+
| long | boolean | Compatible | |
92+
| long | byte | Compatible | |
93+
| long | short | Compatible | |
94+
| long | integer | Compatible | |
95+
| long | float | Compatible | |
96+
| long | double | Compatible | |
97+
| long | decimal | Compatible | |
98+
| long | string | Compatible | |
99+
| long | binary | Unsupported | |
100+
| long | timestamp | Unsupported | |
101+
| float | boolean | Compatible | |
102+
| float | byte | Unsupported | |
103+
| float | short | Unsupported | |
104+
| float | integer | Unsupported | |
105+
| float | long | Unsupported | |
106+
| float | double | Compatible | |
107+
| float | decimal | Unsupported | |
108+
| float | string | Incompatible | |
109+
| float | timestamp | Unsupported | |
110+
| double | boolean | Compatible | |
111+
| double | byte | Unsupported | |
112+
| double | short | Unsupported | |
113+
| double | integer | Unsupported | |
114+
| double | long | Unsupported | |
115+
| double | float | Compatible | |
116+
| double | decimal | Incompatible | |
117+
| double | string | Incompatible | |
118+
| double | timestamp | Unsupported | |
119+
| decimal | boolean | Unsupported | |
120+
| decimal | byte | Unsupported | |
121+
| decimal | short | Unsupported | |
122+
| decimal | integer | Unsupported | |
123+
| decimal | long | Unsupported | |
124+
| decimal | float | Compatible | |
125+
| decimal | double | Compatible | |
126+
| decimal | string | Unsupported | |
127+
| decimal | timestamp | Unsupported | |
128+
| string | boolean | Compatible | |
129+
| string | byte | Compatible | |
130+
| string | short | Compatible | |
131+
| string | integer | Compatible | |
132+
| string | long | Compatible | |
133+
| string | float | Unsupported | |
134+
| string | double | Unsupported | |
135+
| string | decimal | Unsupported | |
136+
| string | binary | Compatible | |
137+
| string | date | Unsupported | |
138+
| string | timestamp | Incompatible | Not all valid formats are supported |
139+
| binary | string | Incompatible | |
140+
| date | boolean | Unsupported | |
141+
| date | byte | Unsupported | |
142+
| date | short | Unsupported | |
143+
| date | integer | Unsupported | |
144+
| date | long | Unsupported | |
145+
| date | float | Unsupported | |
146+
| date | double | Unsupported | |
147+
| date | decimal | Unsupported | |
148+
| date | string | Compatible | |
149+
| date | timestamp | Unsupported | |
150+
| timestamp | boolean | Unsupported | |
151+
| timestamp | byte | Unsupported | |
152+
| timestamp | short | Unsupported | |
153+
| timestamp | integer | Unsupported | |
154+
| timestamp | long | Compatible | |
155+
| timestamp | float | Unsupported | |
156+
| timestamp | double | Unsupported | |
157+
| timestamp | decimal | Unsupported | |
158+
| timestamp | string | Compatible | |
159+
| timestamp | date | Compatible | |

docs/source/user-guide/configs.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Comet provides the following configuration settings.
2525
|--------|-------------|---------------|
2626
| spark.comet.ansi.enabled | Comet does not respect ANSI mode in most cases and by default will not accelerate queries when ansi mode is enabled. Enable this setting to test Comet's experimental support for ANSI mode. This should not be used in production. | false |
2727
| spark.comet.batchSize | The columnar batch size, i.e., the maximum number of rows that a batch can contain. | 8192 |
28-
| spark.comet.cast.stringToTimestamp | Comet is not currently fully compatible with Spark when casting from String to Timestamp. | false |
28+
| spark.comet.cast.allowIncompatible | Comet is not currently fully compatible with Spark for all cast operations. Set this config to true to allow them anyway. See compatibility guide for more information. | false |
2929
| spark.comet.columnar.shuffle.async.enabled | Whether to enable asynchronous shuffle for Arrow-based shuffle. By default, this config is false. | false |
3030
| spark.comet.columnar.shuffle.async.max.thread.num | Maximum number of threads on an executor used for Comet async columnar shuffle. By default, this config is 100. This is the upper bound of total number of shuffle threads per executor. In other words, if the number of cores * the number of shuffle threads per task `spark.comet.columnar.shuffle.async.thread.num` is larger than this config. Comet will use this config as the number of shuffle threads per executor instead. | 100 |
3131
| spark.comet.columnar.shuffle.async.thread.num | Number of threads used for Comet async columnar shuffle per shuffle task. By default, this config is 3. Note that more threads means more memory requirement to buffer shuffle data before flushing to disk. Also, more threads may not always improve performance, and should be set based on the number of cores available. | 3 |

spark/pom.xml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ under the License.
5858
<groupId>org.scala-lang</groupId>
5959
<artifactId>scala-library</artifactId>
6060
</dependency>
61+
<dependency>
62+
<groupId>org.scala-lang</groupId>
63+
<artifactId>scala-reflect</artifactId>
64+
<scope>provided</scope>
65+
</dependency>
6166
<dependency>
6267
<groupId>com.google.protobuf</groupId>
6368
<artifactId>protobuf-java</artifactId>
@@ -270,17 +275,13 @@ under the License.
270275
<version>3.2.0</version>
271276
<executions>
272277
<execution>
273-
<id>generate-config-docs</id>
278+
<id>generate-user-guide-reference-docs</id>
274279
<phase>package</phase>
275280
<goals>
276281
<goal>java</goal>
277282
</goals>
278283
<configuration>
279-
<mainClass>org.apache.comet.CometConfGenerateDocs</mainClass>
280-
<arguments>
281-
<argument>docs/source/user-guide/configs-template.md</argument>
282-
<argument>docs/source/user-guide/configs.md</argument>
283-
</arguments>
284+
<mainClass>org.apache.comet.GenerateDocs</mainClass>
284285
<classpathScope>compile</classpathScope>
285286
</configuration>
286287
</execution>

0 commit comments

Comments
 (0)