Skip to content

Commit db68ff5

Browse files
committed
ORC-386 Add spark benchmarks.
I also refactored all of the old benchmarks to reduce the duplicated code, and split the benchmarks into three modules to separate the common code, the code that depends on Hive, and the code that depends on Spark. Avoiding an uber jar that contains both Hive and Spark made life much easier.
1 parent f3dd9c1 commit db68ff5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2207
-1012
lines changed

java/bench/README.md

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ These big data file format benchmarks, compare:
77
* ORC
88
* Parquet
99

10+
There are three sub-modules to try to mitigate dependency hell:
11+
12+
* core - the shared part of the benchmarks
13+
* hive - the Hive benchmarks
14+
* spark - the Spark benchmarks
15+
1016
To build this library:
1117

1218
```% mvn clean package```
@@ -17,17 +23,25 @@ To fetch the source data:
1723

1824
To generate the derived data:
1925

20-
```% java -jar target/orc-benchmarks-*-uber.jar generate data```
26+
```% java -jar core/target/orc-benchmarks-core-*-uber.jar generate data```
2127

2228
To run a scan of all of the data:
2329

24-
```% java -jar target/orc-benchmarks-*-uber.jar scan data```
30+
```% java -jar core/target/orc-benchmarks-core-*-uber.jar scan data```
2531

2632
To run the full read benchmark:
2733

28-
```% java -jar target/orc-benchmarks-*-uber.jar read-all data```
34+
```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar read-all data```
2935

3036
To run the column projection benchmark:
3137

32-
```% java -jar target/orc-benchmarks-*-uber.jar read-some data```
38+
```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar read-some data```
39+
40+
To run the decimal/decimal64 benchmark:
41+
42+
```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar decimal data```
43+
44+
To run the Spark benchmark:
45+
46+
```% java -jar spark/target/orc-benchmarks-spark-*.jar spark data```
3347

java/bench/core/pom.xml

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
-->
15+
<project xmlns="http://maven.apache.org/POM/4.0.0"
16+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
17+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
18+
<modelVersion>4.0.0</modelVersion>
19+
<parent>
20+
<groupId>org.apache.orc</groupId>
21+
<artifactId>orc-benchmarks</artifactId>
22+
<version>1.6.0-SNAPSHOT</version>
23+
<relativePath>..</relativePath>
24+
</parent>
25+
26+
<groupId>org.apache.orc</groupId>
27+
<artifactId>orc-benchmarks-core</artifactId>
28+
<version>1.6.0-SNAPSHOT</version>
29+
<packaging>jar</packaging>
30+
<name>ORC Benchmarks Core</name>
31+
<description>
32+
The core parts of the benchmarks for comparing performance across formats.
33+
</description>
34+
35+
<dependencies>
36+
<dependency>
37+
<groupId>com.fasterxml.jackson.core</groupId>
38+
<artifactId>jackson-core</artifactId>
39+
</dependency>
40+
<dependency>
41+
<groupId>com.google.auto.service</groupId>
42+
<artifactId>auto-service</artifactId>
43+
</dependency>
44+
<dependency>
45+
<groupId>com.google.code.gson</groupId>
46+
<artifactId>gson</artifactId>
47+
</dependency>
48+
<dependency>
49+
<groupId>commons-cli</groupId>
50+
<artifactId>commons-cli</artifactId>
51+
</dependency>
52+
<dependency>
53+
<groupId>io.airlift</groupId>
54+
<artifactId>aircompressor</artifactId>
55+
</dependency>
56+
<dependency>
57+
<groupId>org.apache.avro</groupId>
58+
<artifactId>avro</artifactId>
59+
</dependency>
60+
<dependency>
61+
<groupId>org.apache.avro</groupId>
62+
<artifactId>avro-mapred</artifactId>
63+
<classifier>hadoop2</classifier>
64+
</dependency>
65+
<dependency>
66+
<groupId>org.apache.commons</groupId>
67+
<artifactId>commons-csv</artifactId>
68+
</dependency>
69+
<dependency>
70+
<groupId>org.apache.hadoop</groupId>
71+
<artifactId>hadoop-common</artifactId>
72+
</dependency>
73+
<dependency>
74+
<groupId>org.apache.hive</groupId>
75+
<artifactId>hive-storage-api</artifactId>
76+
</dependency>
77+
<dependency>
78+
<groupId>org.apache.orc</groupId>
79+
<artifactId>orc-core</artifactId>
80+
</dependency>
81+
<dependency>
82+
<groupId>org.apache.parquet</groupId>
83+
<artifactId>parquet-avro</artifactId>
84+
</dependency>
85+
<dependency>
86+
<groupId>org.apache.parquet</groupId>
87+
<artifactId>parquet-hadoop</artifactId>
88+
</dependency>
89+
<dependency>
90+
<groupId>org.openjdk.jmh</groupId>
91+
<artifactId>jmh-core</artifactId>
92+
</dependency>
93+
<dependency>
94+
<groupId>org.slf4j</groupId>
95+
<artifactId>slf4j-api</artifactId>
96+
</dependency>
97+
<dependency>
98+
<groupId>org.slf4j</groupId>
99+
<artifactId>slf4j-log4j12</artifactId>
100+
</dependency>
101+
</dependencies>
102+
103+
<build>
104+
<sourceDirectory>${basedir}/src/java</sourceDirectory>
105+
<testSourceDirectory>${basedir}/src/test</testSourceDirectory>
106+
<resources>
107+
<resource>
108+
<directory>src/resources</directory>
109+
</resource>
110+
</resources>
111+
<plugins>
112+
<plugin>
113+
<groupId>org.apache.maven.plugins</groupId>
114+
<artifactId>maven-compiler-plugin</artifactId>
115+
</plugin>
116+
<plugin>
117+
<groupId>org.apache.maven.plugins</groupId>
118+
<artifactId>maven-enforcer-plugin</artifactId>
119+
</plugin>
120+
<plugin>
121+
<artifactId>maven-assembly-plugin</artifactId>
122+
<configuration>
123+
<archive>
124+
<manifest>
125+
<mainClass>org.apache.orc.bench.core.Driver</mainClass>
126+
</manifest>
127+
</archive>
128+
</configuration>
129+
</plugin>
130+
</plugins>
131+
</build>
132+
133+
<profiles>
134+
<profile>
135+
<id>cmake</id>
136+
<build>
137+
<directory>${build.dir}/bench/core</directory>
138+
</build>
139+
</profile>
140+
</profiles>
141+
</project>

java/bench/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java renamed to java/bench/core/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/**
1+
/*
22
* Licensed to the Apache Software Foundation (ASF) under one
33
* or more contributor license agreements. See the NOTICE file
44
* distributed with this work for additional information
@@ -19,10 +19,13 @@
1919

2020
import java.io.FileNotFoundException;
2121
import java.io.IOException;
22+
import java.net.URI;
2223

2324
public class TrackingLocalFileSystem extends RawLocalFileSystem {
25+
static final URI NAME = URI.create("track:///");
2426

2527
class TrackingFileInputStream extends RawLocalFileSystem.LocalFSFileInputStream {
28+
2629
public TrackingFileInputStream(Path f) throws IOException {
2730
super(f);
2831
}
@@ -51,6 +54,11 @@ public FSDataInputStream open(Path f, int bufferSize) throws IOException {
5154
new TrackingFileInputStream(f), bufferSize));
5255
}
5356

57+
@Override
58+
public URI getUri() {
59+
return NAME;
60+
}
61+
5462
public FileSystem.Statistics getLocalStatistics() {
5563
return statistics;
5664
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.orc.bench.core;
20+
21+
import org.apache.commons.cli.CommandLine;
22+
import org.apache.commons.cli.DefaultParser;
23+
import org.apache.commons.cli.HelpFormatter;
24+
import org.apache.commons.cli.Options;
25+
import org.apache.commons.cli.ParseException;
26+
27+
public class BenchmarkOptions {
28+
29+
public static final String HELP = "help";
30+
public static final String ITERATIONS = "iterations";
31+
public static final String WARMUP_ITERATIONS = "warmup-iterations";
32+
public static final String FORK = "fork";
33+
public static final String TIME = "time";
34+
public static final String MIN_MEMORY = "min-memory";
35+
public static final String MAX_MEMORY = "max-memory";
36+
public static final String GC = "gc";
37+
38+
public static CommandLine parseCommandLine(String[] args) {
39+
Options options = new Options()
40+
.addOption("h", HELP, false, "Provide help")
41+
.addOption("i", ITERATIONS, true, "Number of iterations")
42+
.addOption("I", WARMUP_ITERATIONS, true, "Number of warmup iterations")
43+
.addOption("f", FORK, true, "How many forks to use")
44+
.addOption("t", TIME, true, "How long each iteration is in seconds")
45+
.addOption("m", MIN_MEMORY, true, "The minimum size of each JVM")
46+
.addOption("M", MAX_MEMORY, true, "The maximum size of each JVM")
47+
.addOption("g", GC, false, "Should GC be profiled");
48+
CommandLine result;
49+
try {
50+
result = new DefaultParser().parse(options, args, true);
51+
} catch (ParseException pe) {
52+
System.err.println("Argument exception - " + pe.getMessage());
53+
result = null;
54+
}
55+
if (result == null || result.hasOption(HELP) || result.getArgs().length == 0) {
56+
new HelpFormatter().printHelp("java -jar <jar> <command> <options> <data>",
57+
options);
58+
System.err.println();
59+
System.exit(1);
60+
}
61+
return result;
62+
}
63+
}

java/bench/src/java/org/apache/orc/bench/CompressionKind.java renamed to java/bench/core/src/java/org/apache/orc/bench/core/CompressionKind.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.orc.bench;
19+
package org.apache.orc.bench.core;
2020

2121
import io.airlift.compress.snappy.SnappyCodec;
2222
import org.apache.hadoop.fs.Path;
@@ -31,9 +31,9 @@
3131
* Enum for handling the compression codecs for the benchmark
3232
*/
3333
public enum CompressionKind {
34-
NONE(".none"),
35-
ZLIB(".gz"),
36-
SNAPPY(".snappy");
34+
NONE("none"),
35+
ZLIB("gz"),
36+
SNAPPY("snappy");
3737

3838
CompressionKind(String extendsion) {
3939
this.extension = extendsion;
@@ -77,11 +77,20 @@ public static CompressionKind fromPath(Path path) {
7777
if (lastDot >= 0) {
7878
String ext = name.substring(lastDot);
7979
for (CompressionKind value : values()) {
80-
if (ext.equals(value.getExtension())) {
80+
if (ext.equals("." + value.getExtension())) {
8181
return value;
8282
}
8383
}
8484
}
8585
return NONE;
8686
}
87+
88+
public static CompressionKind fromExtension(String extension) {
89+
for (CompressionKind value: values()) {
90+
if (value.extension.equals(extension)) {
91+
return value;
92+
}
93+
}
94+
throw new IllegalArgumentException("Unknown compression " + extension);
95+
}
8796
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.orc.bench.core;
20+
21+
import java.util.Arrays;
22+
import java.util.Map;
23+
import java.util.ServiceLoader;
24+
import java.util.TreeMap;
25+
26+
/**
27+
* A driver tool to call the various benchmark classes.
28+
*/
29+
public class Driver {
30+
private static final ServiceLoader<OrcBenchmark> loader =
31+
ServiceLoader.load(OrcBenchmark.class);
32+
33+
private static Map<String, OrcBenchmark> getBenchmarks() {
34+
Map<String, OrcBenchmark> result = new TreeMap<>();
35+
for(OrcBenchmark bench: loader) {
36+
result.put(bench.getName(), bench);
37+
}
38+
return result;
39+
}
40+
41+
private static final String PATTERN = " %10s - %s";
42+
43+
private static void printUsageAndExit(Map<String, OrcBenchmark> benchmarks) {
44+
System.err.println("Commands:");
45+
for(OrcBenchmark bench: benchmarks.values()) {
46+
System.err.println(String.format(PATTERN, bench.getName(),
47+
bench.getDescription()));
48+
}
49+
System.exit(1);
50+
}
51+
52+
public static void main(String[] args) throws Exception {
53+
Map<String, OrcBenchmark> benchmarks = getBenchmarks();
54+
if (args.length == 0) {
55+
printUsageAndExit(benchmarks);
56+
}
57+
String command = args[0];
58+
args = Arrays.copyOfRange(args, 1, args.length);
59+
OrcBenchmark bench = benchmarks.get(command);
60+
if (bench == null) {
61+
printUsageAndExit(benchmarks);
62+
}
63+
bench.run(args);
64+
}
65+
}

0 commit comments

Comments
 (0)