Add TPC-H Benchmarks #139
@@ -24,35 +24,117 @@ import org.apache.spark.sql.SparkSession
 * Convenient runner for benchmarks.
 *
 * To run locally, use
 * `$OPAQUE_HOME/build/sbt 'run edu.berkeley.cs.rise.opaque.benchmark.Benchmark <flags>'`.
 * Available flags:
 *   --num-partitions: specify the number of partitions the data should be split into.
 *     Default: 2 * number of executors if available, 4 otherwise
 *   --size: specify the size of the dataset that should be loaded into Spark.
 *     Default: sf_med
 *   --operations: select the different operations that should be benchmarked.
 *     Default: all
 *     Available operations: logistic-regression, tpc-h
 *     Syntax: --operations "logistic-regression,tpc-h"
 *   --run-local: boolean flag for whether to use the local filesystem instead of HDFS.
 *     Default: HDFS
 * Leave the --operations flag blank to run all benchmarks.
 *
 * To run on a cluster, use `$SPARK_HOME/bin/spark-submit` with appropriate arguments.
 */
object Benchmark {

  val spark = SparkSession.builder()
    .appName("Benchmark")
    .getOrCreate()
  var numPartitions = spark.sparkContext.defaultParallelism
  var size = "sf_med"

  // Configure your HDFS namenode URL here
  var fileUrl = "hdfs://10.0.3.4:8020"

  def dataDir: String = {
    if (System.getenv("SPARKSGX_DATA_DIR") == null) {
      throw new Exception("Set SPARKSGX_DATA_DIR")
    }
    System.getenv("SPARKSGX_DATA_DIR")
  }
  def logisticRegression() = {
    // TODO: this fails when Spark is run on a cluster
    /*
    // Warmup
    LogisticRegression.train(spark, Encrypted, 1000, 1)
    LogisticRegression.train(spark, Encrypted, 1000, 1)

    // Run
    LogisticRegression.train(spark, Insecure, 100000, 1)
    LogisticRegression.train(spark, Encrypted, 100000, 1)
    */
  }

  def runAll() = {
    logisticRegression()
    TPCHBenchmark.run(spark.sqlContext, numPartitions, size, fileUrl)
  }
  def main(args: Array[String]): Unit = {
    Utils.initSQLContext(spark.sqlContext)

    if (args.length >= 2 && args(1) == "--help") {
      println(
        """Available flags:
    --num-partitions: specify the number of partitions the data should be split into.
        Default: 2 * number of executors if available, 4 otherwise
    --size: specify the size of the dataset that should be loaded into Spark.
        Default: sf_med
    --operations: select the different operations that should be benchmarked.
        Default: all
        Available operations: logistic-regression, tpc-h
        Syntax: --operations "logistic-regression,tpc-h"
        Leave the --operations flag blank to run all benchmarks.
    --run-local: boolean flag for whether to use the local filesystem instead of HDFS.
        Default: HDFS"""
      )
    }
    var runAll = true
    // Skip args(0) (the benchmark class name) and group the rest into (flag, value) pairs
    args.slice(1, args.length).sliding(2, 2).toList.collect {
      case Array("--num-partitions", numPartitions: String) => {
        this.numPartitions = numPartitions.toInt
      }
      case Array("--size", size: String) => {
        val supportedSizes = Set("sf_small", "sf_med")
        if (supportedSizes.contains(size)) {
          this.size = size
        } else {
          println("Given size is not supported: available values are " + supportedSizes.toString())
        }
      }
      case Array("--run-local", runLocal: String) => {
        runLocal match {
          case "true" => {
            fileUrl = "file://"
          }
          case _ => {}
        }
      }
      case Array("--operations", operations: String) => {
        runAll = false
        val operationsArr = operations.split(",").map(_.trim)
        for (operation <- operationsArr) {
          operation match {
            case "logistic-regression" => {
              logisticRegression()
            }
            case "tpc-h" => {
              TPCHBenchmark.run(spark.sqlContext, numPartitions, size, fileUrl)
            }
          }
        }
      }
    }
    if (runAll) {
      this.runAll()
    }
    spark.stop()
  }
}

Review comment on lines +125 to +127:
Did you make sure that this actually runs? I remember there were some errors that caused this to fail a while ago...

Reply:
It runs locally in Spark but not in a cluster; going to comment it out and add a TODO for now.
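For reference, a minimal sketch of how the `sliding(2, 2)` parsing pairs flags with their values, using a hypothetical argument array:

// slice(1, _) drops the class-name argument; sliding(2, 2) then yields
// Array("--size", "sf_small") and Array("--run-local", "true")
val args = Array("Benchmark", "--size", "sf_small", "--run-local", "true")
args.slice(1, args.length).sliding(2, 2).toList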
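As a usage sketch, a local run combining the documented flags might look like the following (the flag values here are illustrative, not defaults):

$OPAQUE_HOME/build/sbt 'run edu.berkeley.cs.rise.opaque.benchmark.Benchmark --num-partitions 4 --size sf_small --operations "tpc-h" --run-local true'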
@@ -0,0 +1,57 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.berkeley.cs.rise.opaque.benchmark

import edu.berkeley.cs.rise.opaque.Utils

import org.apache.spark.sql.SQLContext
object TPCHBenchmark {

  // Add query numbers here once they are supported
  val supportedQueries = Seq(1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 22)

  def query(queryNumber: Int, tpch: TPCH, sqlContext: SQLContext, numPartitions: Int) = {
    val sqlStr = tpch.getQuery(queryNumber)
    tpch.generateFiles(numPartitions)

    Utils.timeBenchmark(
      "distributed" -> (numPartitions > 1),
      "query" -> s"TPC-H $queryNumber",
      "system" -> Insecure.name) {

      tpch.performQuery(sqlStr, Insecure).collect
    }

    Utils.timeBenchmark(
      "distributed" -> (numPartitions > 1),
      "query" -> s"TPC-H $queryNumber",
      "system" -> Encrypted.name) {

      tpch.performQuery(sqlStr, Encrypted).collect
    }
  }

  def run(sqlContext: SQLContext, numPartitions: Int, size: String, fileUrl: String) = {
    val tpch = new TPCH(sqlContext, size, fileUrl)

    for (queryNumber <- supportedQueries) {
      query(queryNumber, tpch, sqlContext, numPartitions)
    }
  }
}
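As a usage sketch (assuming an active SparkSession named `spark` and TPC-H data reachable at the given location), the benchmark can also be invoked directly:

// Runs every supported TPC-H query in both insecure and encrypted mode
// against locally generated sf_small data (the argument values are illustrative)
TPCHBenchmark.run(spark.sqlContext, numPartitions = 4, size = "sf_small", fileUrl = "file://")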
Review comment:
I wonder if we should have a help argument here? Also, once this benchmark is set up, it might be good to put some text in the README about this benchmark.