Closed
Changes from all commits
1721 commits
e07baf1
[SPARK-17001][ML] Enable standardScaler to standardize sparse vectors…
srowen Aug 27, 2016
095862a
[SPARK-17271][SQL] Planner adds un-necessary Sort even if child order…
tejasapatil Aug 28, 2016
1a48c00
[BUILD] Closes some stale PRs.
srowen Aug 29, 2016
08913ce
fixed a typo
Aug 29, 2016
6a0fda2
[SPARKR][MINOR] Fix LDA doc
junyangq Aug 29, 2016
48caec2
[SPARK-17063] [SQL] Improve performance of MSCK REPAIR TABLE with Hiv…
Aug 29, 2016
736a791
[SPARK-16581][SPARKR] Make JVM backend calling functions public
shivaram Aug 29, 2016
48b459d
[SPARK-17301][SQL] Remove unused classTag field from AtomicType base …
JoshRosen Aug 30, 2016
8fb445d
[SPARK-17303] Added spark-warehouse to dev/.rat-excludes
frreiss Aug 30, 2016
94922d7
[SPARK-17289][SQL] Fix a bug to satisfy sort requirements in partial …
maropu Aug 30, 2016
bca79c8
[SPARK-17234][SQL] Table Existence Checking when Index Table with the…
gatorsmile Aug 30, 2016
2d76cb1
[SPARK-17276][CORE][TEST] Stop env params output on Jenkins job page
keypointt Aug 30, 2016
befab9c
[SPARK-17264][SQL] DataStreamWriter should document that it only supp…
srowen Aug 30, 2016
d4eee99
[MINOR][DOCS] Fix minor typos in python example code
silentsokolov Aug 30, 2016
2720925
[MINOR][MLLIB][SQL] Clean up unused variables and unused import
keypointt Aug 30, 2016
4b4e329
[SPARK-5682][CORE] Add encrypted shuffle in spark
Aug 30, 2016
fb20084
[SPARK-17304] Fix perf. issue caused by TaskSetManager.abortIfComplet…
JoshRosen Aug 30, 2016
02ac379
[SPARK-17314][CORE] Use Netty's DefaultThreadFactory to enable its fa…
zsxwing Aug 30, 2016
f7beae6
[SPARK-17243][WEB UI] Spark 2.0 History Server won't load with very l…
ajbozarth Aug 30, 2016
231f973
[SPARK-17318][TESTS] Fix ReplSuite replicating blocks of object with …
zsxwing Aug 31, 2016
d92cd22
[SPARK-15985][SQL] Eliminate redundant cast from an array without nul…
kiszk Aug 31, 2016
fa63479
[SPARK-17178][SPARKR][SPARKSUBMIT] Allow to set sparkr shell command …
zjffdu Aug 31, 2016
12fd0cd
[SPARK-17180][SPARK-17309][SPARK-17323][SQL] create AlterViewAsComman…
cloud-fan Aug 31, 2016
9953442
[MINOR][SPARKR] Verbose build comment in WINDOWS.md rather than promo…
HyukjinKwon Aug 31, 2016
0611b3a
[SPARK-17320] add build_profile_flags entry to mesos build module
Aug 31, 2016
9bcb33c
[SPARK-17316][CORE] Make CoarseGrainedSchedulerBackend.removeExecutor…
zsxwing Aug 31, 2016
5d84c7f
[SPARK-17332][CORE] Make Java Loggers static members
srowen Aug 31, 2016
50bb142
[SPARK-17326][SPARKR] Fix tests with HiveContext in SparkR not to be …
HyukjinKwon Aug 31, 2016
d375c8a
[SPARK-17316][TESTS] Fix MesosCoarseGrainedSchedulerBackendSuite
zsxwing Aug 31, 2016
2f9c273
[SPARK-16581][SPARKR] Fix JVM API tests in SparkR
shivaram Aug 31, 2016
d008638
[SPARKR][MINOR] Fix windowPartitionBy example
junyangq Sep 1, 2016
7a5000f
[SPARK-17241][SPARKR][MLLIB] SparkR spark.glm should have configurabl…
keypointt Sep 1, 2016
aaf632b
revert PR#10896 and PR#14865
cloud-fan Sep 1, 2016
21c0a4f
[SPARK-17318][TESTS] Fix ReplSuite replicating blocks of object with …
zsxwing Sep 1, 2016
536fa91
[SPARK-17329][BUILD] Don't build PRs with -Pyarn unless YARN code cha…
srowen Sep 1, 2016
a18c169
[SPARK-16283][SQL] Implements percentile_approx aggregation function …
clockfly Sep 1, 2016
dd859f9
fixed typos
Sep 1, 2016
1f06a5b
[SPARK-17353][SPARK-16943][SPARK-16942][SQL] Fix multiple bugs in CRE…
gatorsmile Sep 1, 2016
8e740ae
[SPARK-17257][SQL] the physical plan of CREATE TABLE or CTAS should t…
cloud-fan Sep 1, 2016
adaaffa
[SPARK-17271][SQL] Remove redundant `semanticEquals()` from `SortOrder`
tejasapatil Sep 1, 2016
a0aac4b
[SPARK-16533][CORE] resolve deadlocking in driver when executors die
angolon Sep 1, 2016
2be5f8d
[SPARK-17263][SQL] Add hexadecimal literal parsing
hvanhovell Sep 1, 2016
3893e8c
[SPARK-17331][CORE][MLLIB] Avoid allocating 0-length arrays
srowen Sep 1, 2016
edb4573
[SPARK-16533][HOTFIX] Fix compilation on Scala 2.10.
Sep 1, 2016
473d786
[SPARK-16926] [SQL] Remove partition columns from partition metadata.
bchocho Sep 1, 2016
e388bd5
[SPARK-16732][SQL] Remove unused codes in subexpressionEliminationFor…
Sep 1, 2016
d314677
[SPARK-16461][SQL] Support partition batch pruning with `<=>` predica…
HyukjinKwon Sep 1, 2016
15539e5
[SPARK-17355] Workaround for HIVE-14684 / HiveResultSetMetaData.isSig…
JoshRosen Sep 1, 2016
03d77af
[SPARK-16525] [SQL] Enable Row Based HashMap in HashAggregateExec
ooq Sep 1, 2016
5bea875
[SPARK-16619] Add shuffle service metrics entry in monitoring docs
lovexi Sep 2, 2016
06e3398
[SPARK-16302][SQL] Set the right number of partitions for reading dat…
lianhuiwang Sep 2, 2016
f2d6e2e
[SPARK-16926][SQL] Add unit test to compare table and partition colum…
bchocho Sep 2, 2016
2ab8dbd
[SPARK-17342][WEBUI] Style of event timeline is broken
sarutak Sep 2, 2016
0f30cde
[SPARK-16883][SPARKR] SQL decimal type is not properly cast to number…
wangmiao1981 Sep 2, 2016
6969dcc
[SPARK-15509][ML][SPARKR] R MLlib algorithms should support input col…
keypointt Sep 2, 2016
a3097e2
[SQL][DOC][MINOR] Add (Scala-specific) and (Java-specific)
jaceklaskowski Sep 2, 2016
7ee24da
[SPARK-17352][WEBUI] Executor computing time can be negative-number b…
sarutak Sep 2, 2016
247a4fa
[SPARK-16935][SQL] Verification of Function-related ExternalCatalog APIs
gatorsmile Sep 2, 2016
806d8a8
[SPARK-16984][SQL] don't try whole dataset immediately when first par…
Sep 2, 2016
6bcbf9b
[SPARK-17351] Refactor JDBCRDD to expose ResultSet -> Seq[Row] utilit…
JoshRosen Sep 2, 2016
ea66228
[SPARK-17261] [PYSPARK] Using HiveContext after re-creating SparkCont…
zjffdu Sep 2, 2016
812333e
[SPARK-17376][SPARKR] Spark version should be available in R
felixcheung Sep 2, 2016
419eefd
[SPARKR][DOC] regexp_extract should doc that it returns empty string …
felixcheung Sep 2, 2016
e79962f
[SPARK-16711] YarnShuffleService doesn't re-init properly on YARN rol…
Sep 2, 2016
eac1d0e
[SPARK-17376][SPARKR] followup - change since version
felixcheung Sep 2, 2016
ed9c884
[SPARK-17230] [SQL] Should not pass optimized query into QueryExecuti…
Sep 2, 2016
a2c9acb
[SPARK-16334] Reusing same dictionary column for decoding consecutive…
sameeragarwal Sep 2, 2016
e6132a6
[SPARK-17298][SQL] Require explicit CROSS join for cartesian products
srinathshankar Sep 2, 2016
d2fde6b
[SPARKR][MINOR] Fix docs for sparkR.session and count
junyangq Sep 3, 2016
7a8a81d
[SPARK-17363][ML][MLLIB] fix MultivariantOnlineSummerizer.numNonZeros
WeichenXu123 Sep 3, 2016
97da410
[SPARK-17347][SQL][EXAMPLES] Encoder in Dataset example has incorrect…
CodingCat Sep 3, 2016
a8a35b3
[MINOR][SQL] Not dropping all necessary tables
techaddict Sep 3, 2016
c2a1576
[SPARK-17335][SQL] Fix ArrayType and MapType CatalogString.
hvanhovell Sep 3, 2016
abb2f92
[SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper
junyangq Sep 3, 2016
e9b58e9
[SPARK-16829][SPARKR] sparkR sc.setLogLevel doesn't work
wangmiao1981 Sep 3, 2016
6b156e2
[SPARK-17324][SQL] Remove Direct Usage of HiveClient in InsertIntoHiv…
gatorsmile Sep 4, 2016
e75c162
[SPARK-17308] Improved the spark core code by replacing all pattern m…
shiv4nsh Sep 4, 2016
cdeb97a
[SPARK-17311][MLLIB] Standardize Python-Java MLlib API to accept opti…
srowen Sep 4, 2016
1b001b5
[MINOR][ML][MLLIB] Remove work around for breeze sparse matrix.
yanboliang Sep 4, 2016
c1e9a6d
[SPARK-17393][SQL] Error Handling when CTAS Against the Same Data Sou…
gatorsmile Sep 5, 2016
3ccb23e
[SPARK-17394][SQL] should not allow specify database in table/view na…
cloud-fan Sep 5, 2016
6d86403
[SPARK-17072][SQL] support table-level statistics generation and stor…
Sep 5, 2016
8d08f43
[SPARK-17279][SQL] better error message for exceptions during ScalaUD…
cloud-fan Sep 6, 2016
afb3d5d
[SPARK-17369][SQL] MetastoreRelation toJSON throws AssertException du…
clockfly Sep 6, 2016
64e826f
[SPARK-17358][SQL] Cached table(parquet/orc) should be shard between …
watermen Sep 6, 2016
c0ae6bc
[SPARK-17361][SQL] file-based external table without path should not …
cloud-fan Sep 6, 2016
6f13aa7
[SPARK-17356][SQL] Fix out of memory issue when generating JSON for T…
clockfly Sep 6, 2016
39d538d
[MINOR][ML] Correct weights doc of MultilayerPerceptronClassification…
yanboliang Sep 6, 2016
bc2767d
[SPARK-17374][SQL] Better error messages when parsing JSON using Data…
clockfly Sep 6, 2016
f7e26d7
[SPARK-16922] [SPARK-17211] [SQL] make the address of values portable…
Sep 6, 2016
6c08dbf
[SPARK-17378][BUILD] Upgrade snappy-java to 1.1.2.6
a-roberts Sep 6, 2016
7775d9f
[SPARK-17299] TRIM/LTRIM/RTRIM should not strips characters other tha…
techaddict Sep 6, 2016
8bbb08a
[MINOR] Remove unnecessary check in MLSerDe
zhengruifeng Sep 6, 2016
29cfab3
[SPARK-17110] Fix StreamCorruptionException in BlockManager.getRemote…
JoshRosen Sep 6, 2016
4f769b9
[SPARK-17296][SQL] Simplify parser join processing.
hvanhovell Sep 6, 2016
0bd00ff
[SPARK-15891][YARN] Clean up some logging in the YARN AM.
Sep 6, 2016
175b434
[SPARK-17316][CORE] Fix the 'ask' type parameter in 'removeExecutor'
zsxwing Sep 6, 2016
c07cbb3
[SPARK-17371] Resubmitted shuffle outputs can get deleted by zombie m…
ericl Sep 6, 2016
a40657b
[SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsS…
gatorsmile Sep 7, 2016
d6eede9
[SPARK-17238][SQL] simplify the logic for converting data source tabl…
cloud-fan Sep 7, 2016
eb1ab88
[SPARK-17372][SQL][STREAMING] Avoid serialization issues by using Arr…
tdas Sep 7, 2016
9fccde4
[SPARK-16785] R dapply doesn't return array or raw columns
clarkfitzg Sep 7, 2016
3ce3a28
[SPARK-17359][SQL][MLLIB] Use ArrayBuffer.+=(A) instead of ArrayBuffe…
lw-lin Sep 7, 2016
6b41195
[SPARK-17339][SPARKR][CORE] Fix some R tests and use Path.toUri in Sp…
HyukjinKwon Sep 7, 2016
6f4aecc
[SPARK-17427][SQL] function SIZE should return -1 when parameter is null
adrian-wang Sep 7, 2016
76ad89e
[MINOR][SQL] Fixing the typo in unit test
Sep 7, 2016
649fa4b
[SPARK-17370] Shuffle service files not invalidated when a slave is lost
ericl Sep 7, 2016
b230fb9
[SPARK-17052][SQL] Remove Duplicate Test Cases auto_join from HiveCom…
gatorsmile Sep 7, 2016
3ced39d
[SPARK-17432][SQL] PreprocessDDL should respect case sensitivity when…
cloud-fan Sep 8, 2016
f0d21b7
[SPARK-17442][SPARKR] Additional arguments in write.df are not passed…
felixcheung Sep 8, 2016
78d5d4d
[SPARK-17200][PROJECT INFRA][BUILD][SPARKR] Automate building and tes…
HyukjinKwon Sep 8, 2016
722afbb
[SPARK-17405] RowBasedKeyValueBatch should use default page size to p…
ericl Sep 8, 2016
92ce8d4
[SPARK-15487][WEB UI] Spark Master UI to reverse proxy Application an…
Sep 9, 2016
65b814b
[SPARK-17456][CORE] Utility for parsing Spark versions
jkbradley Sep 9, 2016
2ed6012
[SPARK-17464][SPARKR][ML] SparkR spark.als argument reg should be 0.1…
yanboliang Sep 9, 2016
7098a12
Streaming doc correction.
Sep 9, 2016
a3981c2
[SPARK-17433] YarnShuffleService doesn't handle moving credentials le…
Sep 9, 2016
f7d2143
[SPARK-17354] [SQL] Partitioning by dates/timestamps should work with…
HyukjinKwon Sep 9, 2016
3354917
[SPARK-15453][SQL] FileSourceScanExec to extract `outputOrdering` inf…
tejasapatil Sep 10, 2016
1fec3ce
[SPARK-11496][GRAPHX] Parallel implementation of personalized pagerank
Sep 10, 2016
bcdd259
[SPARK-15509][FOLLOW-UP][ML][SPARKR] R MLlib algorithms should suppor…
yanboliang Sep 10, 2016
6ea5055
[SPARK-17396][CORE] Share the task support between UnionRDD instances.
rdblue Sep 10, 2016
71b7d42
[SPARK-16445][MLLIB][SPARKR] Fix @return description for sparkR mlp s…
keypointt Sep 10, 2016
29ba957
[SPARK-17389][ML][MLLIB] KMeans speedup with better choice of k-means…
srowen Sep 11, 2016
180796e
[SPARK-17439][SQL] Fixing compression issues with approximate quantil…
thunterdb Sep 11, 2016
bf22217
[SPARK-17330][SPARK UT] Clean up spark-warehouse in UT
Sep 11, 2016
c76baff
[SPARK-17336][PYSPARK] Fix appending multiple times to PYTHONPATH fro…
BryanCutler Sep 11, 2016
883c763
[SPARK-17389][FOLLOW-UP][ML] Change KMeans k-means|| default init ste…
yanboliang Sep 11, 2016
767d480
[SPARK-17415][SQL] Better error message for driver-side broadcast joi…
sameeragarwal Sep 11, 2016
72eec70
[SPARK-17486] Remove unused TaskMetricsUIData.updatedBlockStatuses field
JoshRosen Sep 12, 2016
cc87280
[SPARK-17171][WEB UI] DAG will list all partitions in the graph
cenyuhai Sep 12, 2016
4efcdb7
[SPARK-17447] Performance improvement in Partitioner.defaultPartition…
codlife Sep 12, 2016
b3c2291
[SPARK-16992][PYSPARK] use map comprehension in doc
gsemet Sep 12, 2016
8087ecf
[SPARK CORE][MINOR] fix "default partitioner cannot partition array k…
WeichenXu123 Sep 12, 2016
1742c3a
[SPARK-17503][CORE] Fix memory leak in Memory store when unable to ca…
clockfly Sep 12, 2016
3d40896
[SPARK-17483] Refactoring in BlockManager status reporting and block …
JoshRosen Sep 12, 2016
7c51b99
[SPARK-14818] Post-2.0 MiMa exclusion and build changes
JoshRosen Sep 12, 2016
f9c580f
[SPARK-17485] Prevent failed remote reads of cached blocks from faili…
JoshRosen Sep 12, 2016
a91ab70
[SPARK-17474] [SQL] fix python udf in TakeOrderedAndProjectExec
Sep 12, 2016
46f5c20
[BUILD] Closing some stale PRs and ones suggested to be closed by com…
HyukjinKwon Sep 13, 2016
3f6a2bb
[SPARK-17515] CollectLimit.execute() should perform per-partition limits
JoshRosen Sep 13, 2016
4ba63b1
[SPARK-17142][SQL] Complex query triggers binding error in HashAggreg…
jiangxb1987 Sep 13, 2016
72edc7e
[SPARK-17531] Don't initialize Hive Listeners for the Execution Client
brkyvz Sep 13, 2016
37b93f5
[SPARK-17530][SQL] Add Statistics into DESCRIBE FORMATTED
gatorsmile Sep 13, 2016
a454a4d
[SPARK-17317][SPARKR] Add SparkR vignette
junyangq Sep 14, 2016
def7c26
[SPARK-17449][DOCUMENTATION] Relation between heartbeatInterval and…
jagadeesanas2 Sep 14, 2016
b5bfcdd
[SPARK-17525][PYTHON] Remove SparkContext.clearFiles() from the PySpa…
sjakthol Sep 14, 2016
18b4f03
[CORE][DOC] remove redundant comment
wangmiao1981 Sep 14, 2016
4cea9da
[SPARK-17480][SQL] Improve performance by removing or caching List.le…
seyfe Sep 14, 2016
dc0a4c9
[SPARK-17445][DOCS] Reference an ASF page as the main place to find t…
srowen Sep 14, 2016
52738d4
[SPARK-17409][SQL] Do Not Optimize Query in CTAS More Than Once
gatorsmile Sep 14, 2016
6d06ff6
[SPARK-17514] df.take(1) and df.limit(1).collect() should perform the…
JoshRosen Sep 14, 2016
a79838b
[MINOR][SQL] Add missing functions for some options in SQLConf and us…
HyukjinKwon Sep 14, 2016
040e469
[SPARK-10747][SQL] Support NULLS FIRST|LAST clause in ORDER BY
xwu0226 Sep 14, 2016
ff6e4cb
[SPARK-17511] Yarn Dynamic Allocation: Avoid marking released contain…
kishorvpatil Sep 14, 2016
e33bfae
[SPARK-17463][CORE] Make CollectionAccumulator and SetAccumulator's v…
zsxwing Sep 14, 2016
dbfc7aa
[SPARK-17472] [PYSPARK] Better error message for serialization failur…
ericl Sep 14, 2016
bb32294
[SPARK-17465][SPARK CORE] Inappropriate memory management in `org.apa…
Sep 14, 2016
6a6adb1
[SPARK-17440][SPARK-17441] Fixed Multiple Bugs in ALTER TABLE
gatorsmile Sep 15, 2016
d15b4f9
[SPARK-17507][ML][MLLIB] check weight vector size in ANN
WeichenXu123 Sep 15, 2016
f893e26
[SPARK-17524][TESTS] Use specified spark.buffer.pageSize
a-roberts Sep 15, 2016
647ee05
[SPARK-17521] Error when I use sparkContext.makeRDD(Seq())
codlife Sep 15, 2016
ad79fc0
[SPARK-17406][WEB UI] limit timeline executor events
cenyuhai Sep 15, 2016
71a6582
[SPARK-17536][SQL] Minor performance improvement to JDBC batch inserts
Sep 15, 2016
2ad2769
[SPARK-17406][BUILD][HOTFIX] MiMa excludes fix
srowen Sep 15, 2016
b479278
[SPARK-17451][CORE] CoarseGrainedExecutorBackend should inform driver…
tejasapatil Sep 15, 2016
0ad8eeb
[SPARK-17379][BUILD] Upgrade netty-all to 4.0.41 final for bug fixes
a-roberts Sep 15, 2016
5b8f737
[SPARK-17547] Ensure temp shuffle data file is cleaned up after error
JoshRosen Sep 15, 2016
d403562
[SPARK-17114][SQL] Fix aggregates grouped by literals with empty input
hvanhovell Sep 15, 2016
fe76739
[SPARK-17429][SQL] use ImplicitCastInputTypes with function Length
cenyuhai Sep 15, 2016
a6b8182
[SPARK-17364][SQL] Antlr lexer wrongly treats full qualified identifi…
clockfly Sep 15, 2016
1202075
[SPARK-17484] Prevent invalid block locations from being reported aft…
JoshRosen Sep 15, 2016
b72486f
[SPARK-17458][SQL] Alias specified for aggregates in a pivot are not …
aray Sep 15, 2016
b2e2726
[SPARK-17543] Missing log4j config file for tests in common/network-…
jagadeesanas2 Sep 16, 2016
fc1efb7
[SPARK-17534][TESTS] Increase timeouts for DirectKafkaStreamSuite tests
a-roberts Sep 16, 2016
a425a37
[SPARK-17426][SQL] Refactor `TreeNode.toJSON` to avoid OOM when conve…
clockfly Sep 16, 2016
dca771b
[SPARK-17558] Bump Hadoop 2.7 version from 2.7.2 to 2.7.3
rxin Sep 16, 2016
b9323fc
[SPARK-17561][DOCS] DataFrameWriter documentation formatting problems
srowen Sep 16, 2016
39e2bad
[SPARK-17549][SQL] Only collect table size stat in driver for cached …
Sep 16, 2016
69cb049
Correct fetchsize property name in docs
darabos Sep 17, 2016
f15d41b
[SPARK-17567][DOCS] Use valid url to Spark RDD paper
keypointt Sep 17, 2016
25cbbe6
[SPARK-17548][MLLIB] Word2VecModel.findSynonyms no longer spuriously …
willb Sep 17, 2016
9dbd4b8
[SPARK-17529][CORE] Implement BitSet.clearUntil and use it during mer…
Sep 17, 2016
bbe0b1d
[SPARK-17575][DOCS] Remove extra table tags in configuration document
phalodi Sep 17, 2016
86c2d39
[SPARK-17480][SQL][FOLLOWUP] Fix more instances which calls List.leng…
HyukjinKwon Sep 17, 2016
8faa521
[SPARK-17491] Close serialization stream to fix wrong answer bug in p…
JoshRosen Sep 17, 2016
3a3c9ff
[SPARK-17518][SQL] Block Users to Specify the Internal Data Source Pr…
gatorsmile Sep 18, 2016
3fe630d
[SPARK-17541][SQL] fix some DDL bugs about table management when same…
cloud-fan Sep 18, 2016
5d3f461
[SPARK-17506][SQL] Improve the check double values equality rule.
jiangxb1987 Sep 18, 2016
342c0e6
[SPARK-17546][DEPLOY] start-* scripts should use hostname -f
srowen Sep 18, 2016
7151011
[SPARK-17586][BUILD] Do not call static member via instance reference
HyukjinKwon Sep 18, 2016
1dbb725
[SPARK-16462][SPARK-16460][SPARK-15144][SQL] Make CSV cast null value…
lw-lin Sep 18, 2016
8f0c35a
[SPARK-17571][SQL] AssertOnQuery.condition should always return Boole…
petermaxlee Sep 18, 2016
d720a40
[SPARK-17297][DOCS] Clarify window/slide duration as absolute time, n…
srowen Sep 19, 2016
cdea1d1
[SPARK-17473][SQL] fixing docker integration tests error due to diffe…
sureshthalamati Sep 19, 2016
80d6655
[SPARK-17438][WEBUI] Show Application.executorLimit in the applicatio…
zsxwing Sep 19, 2016
e063206
[SPARK-16439] [SQL] bring back the separator in SQL UI
Sep 19, 2016
d810415
[SPARK-17100] [SQL] fix Python udf in filter on top of outer join
Sep 19, 2016
e719b1c
[SPARK-17160] Properly escape field names in code-generated error mes…
JoshRosen Sep 20, 2016
26145a5
[SPARK-17163][ML] Unified LogisticRegression interface
sethah Sep 20, 2016
be9d57f
[SPARK-17513][SQL] Make StreamExecution garbage-collect its metadata
petermaxlee Sep 20, 2016
f039d96
Revert "[SPARK-17513][SQL] Make StreamExecution garbage-collect its m…
cloud-fan Sep 20, 2016
4a426ff
[SPARK-17437] Add uiWebUrl to JavaSparkContext and pyspark.SparkContext
apetresc Sep 20, 2016
d5ec5db
[SPARK-17502][SQL] Fix Multiple Bugs in DDL Statements on Temporary V…
gatorsmile Sep 20, 2016
eb004c6
[SPARK-17051][SQL] we should use hadoopConf in InsertIntoHiveTable
cloud-fan Sep 20, 2016
a6aade0
[SPARK-15698][SQL][STREAMING] Add the ability to remove the old Metad…
jerryshao Sep 20, 2016
9ac68db
[SPARK-17549][SQL] Revert "[] Only collect table size stat in driver …
yhuai Sep 20, 2016
7e418e9
[SPARK-17611][YARN][TEST] Make shuffle service test really test auth.
Sep 20, 2016
976f3b1
[SPARK-17513][SQL] Make StreamExecution garbage-collect its metadata
petermaxlee Sep 21, 2016
1ea4991
[MINOR][BUILD] Fix CheckStyle Error
weiqingy Sep 21, 2016
e48ebc4
[SPARK-15698][SQL][STREAMING][FOLLW-UP] Fix FileStream source and sin…
jerryshao Sep 21, 2016
61876a4
[CORE][DOC] Fix errors in comments
wangmiao1981 Sep 21, 2016
d3b8869
[SPARK-17585][PYSPARK][CORE] PySpark SparkContext.addFile supports ad…
yanboliang Sep 21, 2016
7654385
[SPARK-17595][MLLIB] Use a bounded priority queue to find synonyms in…
willb Sep 21, 2016
3977223
[SPARK-17617][SQL] Remainder(%) expression.eval returns incorrect res…
clockfly Sep 21, 2016
28fafa3
[SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't…
brkyvz Sep 21, 2016
b366f18
[SPARK-17017][MLLIB][ML] add a chiSquare Selector based on False Posi…
Sep 21, 2016
57dc326
[SPARK-17219][ML] Add NaN value handling in Bucketizer
Sep 21, 2016
25a020b
[SPARK-17583][SQL] Remove uesless rowSeparator variable and set auto-…
HyukjinKwon Sep 21, 2016
dd7561d
[CORE][MINOR] Add minor code change to TaskState and Task
erenavsarogullari Sep 21, 2016
248922f
[SPARK-17590][SQL] Analyze CTE definitions at once and allow CTE subq…
viirya Sep 21, 2016
d7ee122
[SPARK-17418] Prevent kinesis-asl-assembly artifacts from being publi…
JoshRosen Sep 21, 2016
b4a4421
[SPARK-11918][ML] Better error from WLS for cases like singular input
srowen Sep 21, 2016
2cd1bfa
[SPARK-4563][CORE] Allow driver to advertise a different network addr…
Sep 21, 2016
9fcf1c5
[SPARK-17623][CORE] Clarify type of TaskEndReason with a failed task.
squito Sep 21, 2016
8c3ee2b
[SPARK-17512][CORE] Avoid formatting to python path for yarn and meso…
jerryshao Sep 21, 2016
7cbe216
[SPARK-17569] Make StructuredStreaming FileStreamSource batch generat…
brkyvz Sep 22, 2016
c133907
[SPARK-17577][SPARKR][CORE] SparkR support add files to Spark job and…
yanboliang Sep 22, 2016
6902eda
[SPARK-17315][FOLLOW-UP][SPARKR][ML] Fix print of Kolmogorov-Smirnov …
yanboliang Sep 22, 2016
3497ebe
[SPARK-17627] Mark Streaming Providers Experimental
marmbrus Sep 22, 2016
8bde03b
[SPARK-17494][SQL] changePrecision() on compact decimal should respec…
Sep 22, 2016
b50b34f
[SPARK-17609][SQL] SessionCatalog.tableExists should not check temp view
cloud-fan Sep 22, 2016
cb324f6
[SPARK-17425][SQL] Override sameResult in HiveTableScanExec to make R…
watermen Sep 22, 2016
3a80f92
[SPARK-17492][SQL] Fix Reading Cataloged Data Sources without Extendi…
gatorsmile Sep 22, 2016
de7df7d
[SPARK-17625][SQL] set expectedOutputAttributes when converting Simpl…
wzhfy Sep 22, 2016
646f383
[SPARK-17421][DOCS] Documenting the current treatment of MAVEN_OPTS.
frreiss Sep 22, 2016
72d9fba
[SPARK-17281][ML][MLLIB] Add treeAggregateDepth parameter for AFTSurv…
WeichenXu123 Sep 22, 2016
8a02410
[SQL][MINOR] correct the comment of SortBasedAggregationIterator.safe…
cloud-fan Sep 22, 2016
17b72d3
[SPARK-17365][CORE] Remove/Kill multiple executors together to reduce…
Sep 22, 2016
9f24a17
Skip building R vignettes if Spark is not built
shivaram Sep 22, 2016
85d609c
[SPARK-17613] S3A base paths with no '/' at the end return empty Data…
brkyvz Sep 22, 2016
3cdae0f
[SPARK-17638][STREAMING] Stop JVM StreamingContext when the Python pr…
zsxwing Sep 22, 2016
0d63487
[SPARK-17616][SQL] Support a single distinct aggregate combined with …
hvanhovell Sep 22, 2016
f4f6bd8
[SPARK-16240][ML] ML persistence backward compatibility for LDA
GayathriMurali Sep 22, 2016
a166196
[SPARK-17569][SPARK-17569][TEST] Make the unit test added for work again
brkyvz Sep 22, 2016
79159a1
[SPARK-17635][SQL] Remove hardcode "agg_plan" in HashAggregateExec
Sep 23, 2016
a4aeb76
[SPARK-17639][BUILD] Add jce.jar to buildclasspath when building.
Sep 23, 2016
947b8c6
[SPARK-16719][ML] Random Forests should communicate fewer trees on ea…
jkbradley Sep 23, 2016
62ccf27
[SPARK-17640][SQL] Avoid using -1 as the default batchId for FileStre…
zsxwing Sep 23, 2016
5c5396c
[BUILD] Closes some stale PRs
HyukjinKwon Sep 23, 2016
8 changes: 8 additions & 0 deletions .gitignore
@@ -17,11 +17,13 @@
.idea/
.idea_modules/
.project
.pydevproject
.scala_dependencies
.settings
/lib/
R-unit-tests.log
R/unit-tests.out
R/cran-check.out
build/*.jar
build/apache-maven*
build/scala*
@@ -72,7 +74,13 @@ metastore/
metastore_db/
sql/hive-thriftserver/test_warehouses
warehouse/
spark-warehouse/

# For R session data
.RData
.RHistory
.Rhistory
*.Rproj
*.Rproj.*

.Rproj.user
51 changes: 51 additions & 0 deletions .travis.yml
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Spark provides this Travis CI configuration file to help contributors
# check Scala/Java style conformance and JDK7/8 compilation easily
# while preparing their pull requests.
# - Scalastyle is executed during `maven install` implicitly.
# - Java Checkstyle is executed by `lint-java`.
# See the related discussion here.
# https://github.com/apache/spark/pull/12980

# 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM)
sudo: required
dist: trusty

# 2. Choose language and target JDKs for parallel builds.
language: java
jdk:
- oraclejdk7
- oraclejdk8

# 3. Setup cache directory for SBT and Maven.
cache:
  directories:
  - $HOME/.sbt
  - $HOME/.m2

# 4. Turn off notifications.
notifications:
  email: false

# 5. Run maven install before running lint-java.
install:
- export MAVEN_SKIP_RC=1
- build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install

# 6. Run lint-java.
script:
- dev/lint-java
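
To reproduce these Travis CI checks locally before opening a pull request, one can run the same two steps by hand; this is a sketch that copies the commands and profile flags verbatim from the configuration above (a local machine will behave differently from the ~2-core Travis VM):

```bash
# Run the same Maven install (Scalastyle runs implicitly) and then Java Checkstyle
export MAVEN_SKIP_RC=1
build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
dev/lint-java
```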
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -6,7 +6,7 @@ It lists steps that are required before creating a PR. In particular, consider:

- Is the change important and ready enough to ask the community to spend time reviewing?
- Have you searched for existing, related JIRAs and pull requests?
- Is this a new feature that can stand alone as a package on http://spark-packages.org ?
- Is this a new feature that can stand alone as a [third party project](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) ?
- Is the change being proposed clearly explained and motivated?

When you contribute code, you affirm that the contribution is your original work and that you
3 changes: 2 additions & 1 deletion LICENSE
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
(The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.2 - http://py4j.sourceforge.net/)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
(Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
(BSD licence) sbt and sbt-launch-lib.bash
(BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
@@ -296,3 +296,4 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(MIT License) blockUI (http://jquery.malsup.com/block/)
(MIT License) RowsGroup (http://datatables.net/license/mit)
(MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
(MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE)
13 changes: 5 additions & 8 deletions NOTICE
@@ -1,5 +1,5 @@
Apache Spark
Copyright 2014 The Apache Software Foundation.
Copyright 2014 and onwards The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
@@ -12,7 +12,9 @@ Common Development and Distribution License 1.0
The following components are provided under the Common Development and Distribution License 1.0. See project link for details.

(CDDL 1.0) Glassfish Jasper (org.mortbay.jetty:jsp-2.1:6.1.14 - http://jetty.mortbay.org/project/modules/jsp-2.1)
(CDDL 1.0) JAX-RS (https://jax-rs-spec.java.net/)
(CDDL 1.0) Servlet Specification 2.5 API (org.mortbay.jetty:servlet-api-2.5:6.1.14 - http://jetty.mortbay.org/project/modules/servlet-api-2.5)
(CDDL 1.0) (GPL2 w/ CPE) javax.annotation API (https://glassfish.java.net/nonav/public/CDDL+GPL.html)
(COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0) (GNU General Public Library) Streaming API for XML (javax.xml.stream:stax-api:1.0-2 - no url defined)
(Common Development and Distribution License (CDDL) v1.0) JavaBeans Activation Framework (JAF) (javax.activation:activation:1.1 - http://java.sun.com/products/javabeans/jaf/index.jsp)

@@ -22,15 +24,10 @@ Common Development and Distribution License 1.1

The following components are provided under the Common Development and Distribution License 1.1. See project link for details.

(CDDL 1.1) (GPL2 w/ CPE) org.glassfish.hk2 (https://hk2.java.net)
(CDDL 1.1) (GPL2 w/ CPE) JAXB API bundle for GlassFish V3 (javax.xml.bind:jaxb-api:2.2.2 - https://jaxb.dev.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) JAXB RI (com.sun.xml.bind:jaxb-impl:2.2.3-1 - http://jaxb.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:1.8 - https://jersey.dev.java.net/jersey-core/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:1.9 - https://jersey.java.net/jersey-core/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-guice (com.sun.jersey.contribs:jersey-guice:1.9 - https://jersey.java.net/jersey-contribs/jersey-guice/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:1.8 - https://jersey.dev.java.net/jersey-json/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:1.9 - https://jersey.java.net/jersey-json/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:1.8 - https://jersey.dev.java.net/jersey-server/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:1.9 - https://jersey.java.net/jersey-server/)
(CDDL 1.1) (GPL2 w/ CPE) Jersey 2 (https://jersey.java.net)

========================================================================
Common Public License 1.0
2 changes: 2 additions & 0 deletions R/.gitignore
@@ -4,3 +4,5 @@
lib
pkg/man
pkg/html
SparkR.Rcheck/
SparkR_*.tar.gz
12 changes: 6 additions & 6 deletions R/DOCUMENTATION.md
@@ -1,12 +1,12 @@
# SparkR Documentation

SparkR documentation is generated using in-source comments annotated using using
`roxygen2`. After making changes to the documentation, to generate man pages,
SparkR documentation is generated using in-source comments annotated with
[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation, to generate man pages,
you can run the following from an R console in the SparkR home directory

library(devtools)
devtools::document(pkg="./pkg", roclets=c("rd"))

```R
library(devtools)
devtools::document(pkg="./pkg", roclets=c("rd"))
```
You can verify if your changes are good by running

R CMD check pkg/
32 changes: 18 additions & 14 deletions R/README.md
@@ -1,12 +1,13 @@
# R on Spark

SparkR is an R package that provides a light-weight frontend to use Spark from R.

### Installing sparkR

Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`.
By default the above script uses the system-wide installation of R. However, this can be changed to any user-installed location of R by setting the environment variable `R_HOME` to the full path of the base directory where R is installed, before running the install-dev.sh script.
Example:
```
```bash
# where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript
export R_HOME=/home/username/R
./install-dev.sh
@@ -17,8 +18,9 @@ export R_HOME=/home/username/R
#### Build Spark

Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
```
build/mvn -DskipTests -Psparkr package

```bash
build/mvn -DskipTests -Psparkr package
```

#### Running sparkR
@@ -37,8 +39,8 @@ To set other options like driver memory, executor memory etc. you can pass in th

#### Using SparkR from RStudio

If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
```
If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
```R
# Set this to where Spark is installed
Sys.setenv(SPARK_HOME="/Users/username/spark")
# This line loads SparkR from the installed directory
@@ -55,23 +57,25 @@ Once you have made your changes, please include unit tests for them and run exis

#### Generating documentation

The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script.
The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. Also, you may need to install these [prerequisites](https://github.com/apache/spark/tree/master/docs#prerequisites). See also, `R/DOCUMENTATION.md`

### Examples, Unit tests

SparkR comes with several sample programs in the `examples/src/main/r` directory.
To run one of them, use `./bin/spark-submit <filename> <args>`. For example:

./bin/spark-submit examples/src/main/r/dataframe.R

You can also run the unit-tests for SparkR by running (you need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first):

R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```bash
./bin/spark-submit examples/src/main/r/dataframe.R
```
You can also run the unit tests for SparkR by running the commands below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
```bash
R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```

### Running on YARN

The `./bin/spark-submit` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run
```
```bash
export YARN_CONF_DIR=/etc/hadoop/conf
./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
```
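
For readers who want to combine the YARN submission above with the driver/executor memory options mentioned earlier in this README, here is a hedged illustration; `--driver-memory` and `--executor-memory` are standard `spark-submit` flags, not something introduced by this diff:

```bash
# Hypothetical example: submit the R example to YARN with explicit resource settings
export YARN_CONF_DIR=/etc/hadoop/conf
./bin/spark-submit --master yarn \
  --driver-memory 2g \
  --executor-memory 2g \
  examples/src/main/r/dataframe.R
```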
32 changes: 31 additions & 1 deletion R/WINDOWS.md
@@ -4,10 +4,40 @@ To build SparkR on Windows, the following steps are required

1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
include Rtools and R in `PATH`.

2. Install
[JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
`JAVA_HOME` in the system environment variables.

3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`
directory in Maven in `PATH`.

4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).
5. Open a command shell (`cmd`) in the Spark directory and run `mvn -DskipTests -Psparkr package`

5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run

```bash
mvn.cmd -DskipTests -Psparkr package
```

`.\build\mvn` is a shell script so `mvn.cmd` should be used directly on Windows.

## Unit tests

To run the SparkR unit tests on Windows, the following steps are required, assuming you are in the Spark root directory and do not have Apache Hadoop installed already:

1. Create a folder to download Hadoop related files for Windows. For example, `cd ..` and `mkdir hadoop`.

2. Download the relevant Hadoop bin package from [steveloughran/winutils](https://github.com/steveloughran/winutils). While these are not official ASF artifacts, they are built from the ASF release git hashes by a Hadoop PMC member on a dedicated Windows VM. For further reading, consult [Windows Problems on the Hadoop wiki](https://wiki.apache.org/hadoop/WindowsProblems).

3. Install the files into `hadoop\bin`; make sure that `winutils.exe` and `hadoop.dll` are present.

4. Set the environment variable `HADOOP_HOME` to the full path to the newly created `hadoop` directory.

5. Run unit tests for SparkR by running the command below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:

```
R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
.\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R
```

64 changes: 64 additions & 0 deletions R/check-cran.sh
@@ -0,0 +1,64 @@
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -o pipefail
set -e

FWDIR="$(cd `dirname $0`; pwd)"
pushd $FWDIR > /dev/null

if [ ! -z "$R_HOME" ]
then
R_SCRIPT_PATH="$R_HOME/bin"
else
# if system wide R_HOME is not found, then exit
if [ ! `command -v R` ]; then
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
exit 1
fi
R_SCRIPT_PATH="$(dirname $(which R))"
fi
echo "USING R_HOME = $R_HOME"

# Build the latest docs
$FWDIR/create-docs.sh

# Build a zip file containing the source package
"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg

# Run check as-cran.
VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`

CRAN_CHECK_OPTIONS="--as-cran"

if [ -n "$NO_TESTS" ]
then
CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-tests"
fi

if [ -n "$NO_MANUAL" ]
then
CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual"
fi

echo "Running CRAN check with $CRAN_CHECK_OPTIONS options"

"$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz

popd > /dev/null
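
A usage sketch for the new check script, based only on the environment variables it reads above (`R_HOME`, `NO_TESTS`, `NO_MANUAL`):

```bash
# Full CRAN check (builds docs, the source package, then runs R CMD check --as-cran)
./R/check-cran.sh

# Skip running tests and building the manual
NO_TESTS=1 NO_MANUAL=1 ./R/check-cran.sh

# Use a specific R installation instead of the system-wide one
R_HOME=/home/username/R ./R/check-cran.sh
```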
30 changes: 28 additions & 2 deletions R/create-docs.sh
@@ -17,17 +17,26 @@
# limitations under the License.
#

# Script to create API docs for SparkR
# This requires `devtools` and `knitr` to be installed on the machine.
# Script to create API docs and vignettes for SparkR
# This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine.

# After running this script the html docs can be found in
# $SPARK_HOME/R/pkg/html
# The vignettes can be found in
# $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html

set -o pipefail
set -e

# Figure out where the script is
export FWDIR="$(cd "`dirname "$0"`"; pwd)"
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"

# Required for setting SPARK_SCALA_VERSION
. "${SPARK_HOME}"/bin/load-spark-env.sh

echo "Using Scala $SPARK_SCALA_VERSION"

pushd $FWDIR

# Install the package (this will also generate the Rd files)
@@ -43,4 +52,21 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit

popd

# Find Spark jars.
if [ -f "${SPARK_HOME}/RELEASE" ]; then
SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
fi

# Only create vignettes if Spark JARs exist
if [ -d "$SPARK_JARS_DIR" ]; then
# render creates SparkR vignettes
Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)'

find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete
else
echo "Skipping R vignettes as Spark JARs not found in $SPARK_HOME"
fi

popd
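
To exercise the updated script end to end, the packages named in its header comment must be installed first; a sketch (the CRAN mirror URL is illustrative):

```bash
# Install the packages the script requires, then build the docs and vignettes
Rscript -e 'install.packages(c("devtools", "knitr", "rmarkdown"), repos="https://cran.r-project.org")'
./R/create-docs.sh
# HTML docs:  $SPARK_HOME/R/pkg/html
# Vignettes:  $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html
```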