From 680b4759e04c7d371fc555e220730e0a4da251f5 Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Wed, 4 Nov 2015 18:37:04 -0800
Subject: [PATCH 1/8] Add stddev and friends

---
 R/pkg/R/group.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
index 4cab1a69f601..f0b158997944 100644
--- a/R/pkg/R/group.R
+++ b/R/pkg/R/group.R
@@ -118,7 +118,9 @@ setMethod("summarize",
           })
 
 # sum/mean/avg/min/max
-methods <- c("sum", "mean", "avg", "min", "max")
+methods <- c("sum", "mean", "avg", "min", "max",
+             "stddev", "stddev_samp", "stddev_pop",
+             "variance", "var_samp", "var_pop", "skewness", "kurtosis")
 
 createMethod <- function(name) {
   setMethod(name,

From f63608e3d36bae0544281a94c732760bfebfcc6d Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Wed, 4 Nov 2015 21:05:57 -0800
Subject: [PATCH 2/8] add aggFunction by name and on Column

---
 R/pkg/R/functions.R | 152 ++++++++++++++++++++++++++++++++++++++++----
 R/pkg/R/generics.R  |  32 ++++++++++
 R/pkg/R/group.R     |  14 ++--
 3 files changed, 177 insertions(+), 21 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index d7fd27927913..c082ba423ad4 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -520,6 +520,22 @@ setMethod("isNaN",
             column(jc)
           })
 
+#' kurtosis
+#'
+#' Aggregate function: returns the kurtosis of the values in a group.
+#'
+#' @rdname kurtosis
+#' @name kurtosis
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{kurtosis(df$c)}
+setMethod("kurtosis",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "kurtosis", x@jc)
+            column(jc)
+          })
+
 #' last
 #'
 #' Aggregate function: returns the last value in a group.
@@ -958,6 +974,22 @@ setMethod("size",
             column(jc)
           })
 
+#' skewness
+#'
+#' Aggregate function: returns the skewness of the values in a group.
+#'
+#' @rdname skewness
+#' @name skewness
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{skewness(df$c)}
+setMethod("skewness",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "skewness", x@jc)
+            column(jc)
+          })
+
 #' soundex
 #'
 #' Return the soundex code for the specified expression.
@@ -974,6 +1006,54 @@ setMethod("soundex",
             column(jc)
           })
 
+#' var_samp
+#'
+#' Aggregate function: alias for \link{stddev_samp}
+#'
+#' @rdname stddev
+#' @name stddev
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{stddev(df$c)}
+setMethod("stddev",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "stddev", x@jc)
+            column(jc)
+          })
+
+#' stddev_samp
+#'
+#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
+#'
+#' @rdname stddev_samp
+#' @name stddev_samp
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{stddev_samp(df$c)}
+setMethod("stddev_samp",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
+            column(jc)
+          })
+
+#' stddev_pop
+#'
+#' Aggregate function: returns the population standard deviation of the expression in a group.
+#'
+#' @rdname stddev_pop
+#' @name stddev_pop
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{stddev_pop(df$c)}
+setMethod("stddev_pop",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
+            column(jc)
+          })
+
 #' sqrt
 #'
 #' Computes the square root of the specified float value.
@@ -1168,6 +1248,54 @@ setMethod("upper",
             column(jc)
           })
 
+#' variance
+#'
+#' Aggregate function: alias for \link{var_samp}.
+#'
+#' @rdname variance
+#' @name variance
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{variance(df$c)}
+setMethod("variance",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "variance", x@jc)
+            column(jc)
+          })
+
+#' var_samp
+#'
+#' Aggregate function: returns the unbiased variance of the values in a group.
+#'
+#' @rdname var_samp
+#' @name var_samp
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{var_samp(df$c)}
+setMethod("var_samp",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
+            column(jc)
+          })
+
+#' var_pop
+#'
+#' Aggregate function: returns the population variance of the values in a group.
+#'
+#' @rdname var_pop
+#' @name var_pop
+#' @family agg_funcs
+#' @export
+#' @examples \dontrun{var_pop(df$c)}
+setMethod("var_pop",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
+            column(jc)
+          })
+
 #' weekofyear
 #'
 #' Extracts the week number as an integer from a given date/timestamp/string.
@@ -2020,10 +2148,10 @@ setMethod("ifelse",
 #'
 #' Window function: returns the cumulative distribution of values within a window partition,
 #' i.e. the fraction of rows that are below the current row.
-#' 
+#'
 #'   N = total number of rows in the partition
 #'   cumeDist(x) = number of values before (and including) x / N
-#'   
+#'
 #' This is equivalent to the CUME_DIST function in SQL.
 #'
 #' @rdname cumeDist
@@ -2039,13 +2167,13 @@ setMethod("cumeDist",
           })
 
 #' denseRank
-#' 
+#'
 #' Window function: returns the rank of rows within a window partition, without any gaps.
 #' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
 #' sequence when there are ties. That is, if you were ranking a competition using denseRank
 #' and had three people tie for second place, you would say that all three were in second
 #' place and that the next person came in third.
-#' 
+#'
 #' This is equivalent to the DENSE_RANK function in SQL.
 #'
 #' @rdname denseRank
@@ -2065,7 +2193,7 @@ setMethod("denseRank",
 #' Window function: returns the value that is `offset` rows before the current row, and
 #' `defaultValue` if there is less than `offset` rows before the current row. For example,
 #' an `offset` of one will return the previous row at any given point in the window partition.
-#' 
+#'
 #' This is equivalent to the LAG function in SQL.
 #'
 #' @rdname lag
@@ -2092,7 +2220,7 @@ setMethod("lag",
 #' Window function: returns the value that is `offset` rows after the current row, and
 #' `null` if there is less than `offset` rows after the current row. For example,
 #' an `offset` of one will return the next row at any given point in the window partition.
-#' 
+#'
 #' This is equivalent to the LEAD function in SQL.
 #'
 #' @rdname lead
@@ -2119,7 +2247,7 @@ setMethod("lead",
 #' Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window
 #' partition. Fow example, if `n` is 4, the first quarter of the rows will get value 1, the second
 #' quarter will get 2, the third quarter will get 3, and the last quarter will get 4.
-#' 
+#'
 #' This is equivalent to the NTILE function in SQL.
 #'
 #' @rdname ntile
@@ -2137,9 +2265,9 @@ setMethod("ntile",
 #' percentRank
 #'
 #' Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
-#' 
+#'
 #' This is computed by:
-#' 
+#'
 #'   (rank of row in its partition - 1) / (number of rows in the partition - 1)
 #'
 #' This is equivalent to the PERCENT_RANK function in SQL.
@@ -2159,12 +2287,12 @@ setMethod("percentRank",
 #' rank
 #'
 #' Window function: returns the rank of rows within a window partition.
-#' 
+#'
 #' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
 #' sequence when there are ties. That is, if you were ranking a competition using denseRank
 #' and had three people tie for second place, you would say that all three were in second
 #' place and that the next person came in third.
-#' 
+#'
 #' This is equivalent to the RANK function in SQL.
 #'
 #' @rdname rank
@@ -2189,7 +2317,7 @@ setMethod("rank",
 #' rowNumber
 #'
 #' Window function: returns a sequential number starting at 1 within a window partition.
-#' 
+#'
 #' This is equivalent to the ROW_NUMBER function in SQL.
 #'
 #' @rdname rowNumber
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 0b35340e48e4..bcd5883975ff 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -798,6 +798,10 @@ setGeneric("instr", function(y, x) { standardGeneric("instr") })
 #' @export
 setGeneric("isNaN", function(x) { standardGeneric("isNaN") })
 
+#' @rdname kurtosis
+#' @export
+setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") })
+
 #' @rdname lag
 #' @export
 setGeneric("lag", function(x, offset, defaultValue = NULL) { standardGeneric("lag") })
@@ -967,10 +971,26 @@ setGeneric("signum", function(x) { standardGeneric("signum") })
 #' @export
 setGeneric("size", function(x) { standardGeneric("size") })
 
+#' @rdname skewness
+#' @export
+setGeneric("skewness", function(x) { standardGeneric("skewness") })
+
 #' @rdname soundex
 #' @export
 setGeneric("soundex", function(x) { standardGeneric("soundex") })
 
+#' @rdname stddev
+#' @export
+setGeneric("stddev", function(x) { standardGeneric("stddev") })
+
+#' @rdname stddev_samp
+#' @export
+setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") })
+
+#' @rdname stddev_pop
+#' @export
+setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") })
+
 #' @rdname substring_index
 #' @export
 setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") })
@@ -1019,6 +1039,18 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta
 #' @export
 setGeneric("upper", function(x) { standardGeneric("upper") })
 
+#' @rdname variance
+#' @export
+setGeneric("variance", function(x) { standardGeneric("variance") })
+
+#' @rdname var_samp
+#' @export
+setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
+
+#' @rdname var_pop
+#' @export
+setGeneric("var_pop", function(x) { standardGeneric("var_pop") })
+
 #' @rdname weekofyear
 #' @export
 setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
index f0b158997944..cc788128317b 100644
--- a/R/pkg/R/group.R
+++ b/R/pkg/R/group.R
@@ -62,11 +62,6 @@ setMethod("show", "GroupedData",
 #' \dontrun{
 #'   count(groupBy(df, "name"))
 #' }
-setMethod("count",
-          signature(x = "GroupedData"),
-          function(x) {
-            dataFrame(callJMethod(x@sgd, "count"))
-          })
 
 #' Agg
 #'
@@ -79,6 +74,7 @@ setMethod("count",
 #' @param x a GroupedData
 #' @return a DataFrame
 #' @rdname agg
+#' @family agg_funcs
 #' @examples
 #' \dontrun{
 #'  df2 <- agg(df, age = "sum")  # new column name will be created as 'SUM(age#0)'
@@ -117,10 +113,10 @@ setMethod("summarize",
             agg(x, ...)
           })
 
-# sum/mean/avg/min/max
-methods <- c("sum", "mean", "avg", "min", "max",
-             "stddev", "stddev_samp", "stddev_pop",
-             "variance", "var_samp", "var_pop", "skewness", "kurtosis")
+# Aggregate Functions by name
+methods <- c("approxCountDistinct", "avg", "corr", "count", "countDistinct", "first", "kurtosis",
+             "last", "max", "mean", "min", "skewness", "stddev", "stddev_samp", "stddev_pop",
+             "sum", "sumDistinct", "variance", "var_samp", "var_pop")
 
 createMethod <- function(name) {
   setMethod(name,

From e0fda371d3691a21672a94e9e0b383890cc745a6 Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Thu, 5 Nov 2015 00:27:15 -0800
Subject: [PATCH 3/8] Add tests, checked supported functions for GroupedData

---
 R/pkg/NAMESPACE                  |  8 +++++
 R/pkg/R/group.R                  | 12 +++++--
 R/pkg/inst/tests/test_sparkSQL.R | 57 ++++++++++++++++++++++++--------
 3 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index cd9537a2655f..0a0a46b187aa 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -152,6 +152,7 @@ exportMethods("%in%",
               "isNaN",
               "isNotNull",
               "isNull",
+              "kurtosis",
               "lag",
               "last",
               "last_day",
@@ -209,7 +210,11 @@ exportMethods("%in%",
               "sin",
               "sinh",
               "size",
+              "skewness",
               "soundex",
+              "stddev",
+              "stddev_samp",
+              "stddev_pop",
               "sqrt",
               "startsWith",
               "substr",
@@ -228,6 +233,9 @@ exportMethods("%in%",
               "unhex",
               "unix_timestamp",
               "upper",
+              "variance",
+              "var_samp",
+              "var_pop",
               "weekofyear",
               "when",
               "year")
diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
index cc788128317b..e5f702faee65 100644
--- a/R/pkg/R/group.R
+++ b/R/pkg/R/group.R
@@ -62,6 +62,11 @@ setMethod("show", "GroupedData",
 #' \dontrun{
 #'   count(groupBy(df, "name"))
 #' }
+setMethod("count",
+          signature(x = "GroupedData"),
+          function(x) {
+            dataFrame(callJMethod(x@sgd, "count"))
+          })
 
 #' Agg
 #'
@@ -114,9 +119,10 @@ setMethod("summarize",
           })
 
 # Aggregate Functions by name
-methods <- c("approxCountDistinct", "avg", "corr", "count", "countDistinct", "first", "kurtosis",
-             "last", "max", "mean", "min", "skewness", "stddev", "stddev_samp", "stddev_pop",
-             "sum", "sumDistinct", "variance", "var_samp", "var_pop")
+methods <- c("avg", "max", "mean", "min", "sum")
+
+# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", "stddev_pop",
+# "variance", "var_samp", "var_pop"
 
 createMethod <- function(name) {
   setMethod(name,
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index b4a4d03b2643..0fc82187d523 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -826,12 +826,13 @@ test_that("column functions", {
   c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
   c7 <- mean(c) + min(c) + month(c) + negate(c) + quarter(c)
   c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c)
-  c9 <- signum(c) + sin(c) + sinh(c) + size(c) + soundex(c) + sqrt(c) + sum(c)
+  c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
   c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
   c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
-  c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
-  c13 <- cumeDist() + ntile(1)
-  c14 <- denseRank() + percentRank() + rank() + rowNumber()
+  c12 <- variance(c)
+  c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
+  c14 <- cumeDist() + ntile(1)
+  c15 <- denseRank() + percentRank() + rank() + rowNumber()
 
   # Test if base::rank() is exposed
   expect_equal(class(rank())[[1]], "Column")
@@ -849,6 +850,12 @@ test_that("column functions", {
   expect_equal(collect(df3)[[2, 1]], FALSE)
   expect_equal(collect(df3)[[3, 1]], TRUE)
 
+  expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
+
+  expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
+
+  expect_equal(collect(select(df, variance(df$age)))[1, 1], 30.25)
+
   df4 <- createDataFrame(sqlContext, list(list(a = "010101")))
   expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15")
 })
@@ -997,20 +1004,42 @@ test_that("group by", {
   expect_is(df_summarized, "DataFrame")
   expect_equal(3, count(df_summarized))
 
-  df3 <- agg(gd, age = "sum")
+  df3 <- agg(gd, age = "stddev")
   expect_is(df3, "DataFrame")
-  expect_equal(3, count(df3))
+  df3_local <- collect(df3)
+  expect_equal(0, df3_local[df3_local$name == "Andy",][1, 2])
 
-  df3 <- agg(gd, age = sum(df$age))
-  expect_is(df3, "DataFrame")
-  expect_equal(3, count(df3))
-  expect_equal(columns(df3), c("name", "age"))
-
-  df4 <- sum(gd, "age")
+  df4 <- agg(gd, sumAge = sum(df$age))
   expect_is(df4, "DataFrame")
   expect_equal(3, count(df4))
-  expect_equal(3, count(mean(gd, "age")))
-  expect_equal(3, count(max(gd, "age")))
+  expect_equal(columns(df4), c("name", "sumAge"))
+
+  df5 <- sum(gd, "age")
+  expect_is(df5, "DataFrame")
+  expect_equal(3, count(df5))
+
+  expect_equal(3, count(mean(gd)))
+  expect_equal(3, count(max(gd)))
+  expect_equal(30, collect(max(gd))[1,2])
+  expect_equal(1, collect(count(gd))[1,2])
+
+  mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
+                  "{\"name\":\"ID1\", \"value\": \"10\"}",
+                  "{\"name\":\"ID1\", \"value\": \"22\"}",
+                  "{\"name\":\"ID2\", \"value\": \"-3\"}")
+  jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
+  writeLines(mockLines2, jsonPath2)
+  gd2 <- groupBy(jsonFile(sqlContext, jsonPath2), "name")
+  df6 <- agg(gd2, value = "sum")
+  df6_local <- collect(df6)
+  expect_equal(42, df6_local[df6_local$name == "ID1",][1, 2])
+  expect_equal(-3, df6_local[df6_local$name == "ID2",][1, 2])
+
+  df7 <- agg(gd2, value = "stddev")
+  df7_local <- collect(df7)
+
+  expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6)
+  expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2])
 })
 
 test_that("arrange() and orderBy() on a DataFrame", {

From 58d0fd91843215cf05cd524bcfd75b9922fd59b8 Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Thu, 5 Nov 2015 00:34:27 -0800
Subject: [PATCH 4/8] fix bug in rdoc

---
 R/pkg/R/functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index c082ba423ad4..41cdf7544e04 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1006,7 +1006,7 @@ setMethod("soundex",
             column(jc)
           })
 
-#' var_samp
+#' stddev
 #'
 #' Aggregate function: alias for \link{stddev_samp}
 #'

From 7a2a90468623814c8bfc46c22a5058b350951a64 Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Thu, 5 Nov 2015 13:37:21 -0800
Subject: [PATCH 5/8] test broken by variance mapping changed var_amp, make
 this fix at var_pop

---
 R/pkg/inst/tests/test_sparkSQL.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 0fc82187d523..454be47580b7 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -854,7 +854,7 @@ test_that("column functions", {
 
   expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
 
-  expect_equal(collect(select(df, variance(df$age)))[1, 1], 30.25)
+  expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
 
   df4 <- createDataFrame(sqlContext, list(list(a = "010101")))
   expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15")

From 769b553ceee5d967eb9057d56fbc2f5f73de6b08 Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Tue, 10 Nov 2015 15:53:04 -0800
Subject: [PATCH 6/8] add alias, fixes from feedback

---
 R/pkg/NAMESPACE                  |  6 ++++--
 R/pkg/R/functions.R              | 18 ++++++++++++++++++
 R/pkg/R/generics.R               | 22 +++++++++++++++-------
 R/pkg/inst/tests/test_sparkSQL.R |  1 +
 4 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 0a0a46b187aa..6b304587b077 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -205,6 +205,7 @@ exportMethods("%in%",
               "shiftLeft",
               "shiftRight",
               "shiftRightUnsigned",
+              "sd",
               "sign",
               "signum",
               "sin",
@@ -213,8 +214,8 @@ exportMethods("%in%",
               "skewness",
               "soundex",
               "stddev",
-              "stddev_samp",
               "stddev_pop",
+              "stddev_samp",
               "sqrt",
               "startsWith",
               "substr",
@@ -233,9 +234,10 @@ exportMethods("%in%",
               "unhex",
               "unix_timestamp",
               "upper",
+              "var",
               "variance",
-              "var_samp",
               "var_pop",
+              "var_samp",
               "weekofyear",
               "when",
               "year")
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 41cdf7544e04..64ab44b1dcfa 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -877,6 +877,15 @@ setMethod("rtrim",
             column(jc)
           })
 
+#' @rdname stddev
+#' @name sd
+#' @family agg_funcs
+setMethod("sd",
+          signature(x = "Column"),
+          function(x, na.rm = FALSE) {
+            stddev_samp(x)
+          })
+
 #' second
 #'
 #' Extracts the seconds as an integer from a given date/timestamp/string.
@@ -1248,6 +1257,15 @@ setMethod("upper",
             column(jc)
           })
 
+#' @family agg_funcs
+#' @rdname variance
+#' @name var
+setMethod("var",
+          signature(x = "Column"),
+          function(x, y = NULL, na.rm = FALSE, use) {
+            var_samp(x)
+          })
+
 #' variance
 #'
 #' Aggregate function: alias for \link{var_samp}.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index bcd5883975ff..e7701a942625 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -939,6 +939,10 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
 #' @export
 setGeneric("rtrim", function(x) { standardGeneric("rtrim") })
 
+#' @rdname stddev
+#' @export
+setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") })
+
 #' @rdname second
 #' @export
 setGeneric("second", function(x) { standardGeneric("second") })
@@ -983,14 +987,14 @@ setGeneric("soundex", function(x) { standardGeneric("soundex") })
 #' @export
 setGeneric("stddev", function(x) { standardGeneric("stddev") })
 
-#' @rdname stddev_samp
-#' @export
-setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") })
-
 #' @rdname stddev_pop
 #' @export
 setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") })
 
+#' @rdname stddev_samp
+#' @export
+setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") })
+
 #' @rdname substring_index
 #' @export
 setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") })
@@ -1041,16 +1045,20 @@ setGeneric("upper", function(x) { standardGeneric("upper") })
 
 #' @rdname variance
 #' @export
-setGeneric("variance", function(x) { standardGeneric("variance") })
+setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") })
 
-#' @rdname var_samp
+#' @rdname variance
 #' @export
-setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
+setGeneric("variance", function(x) { standardGeneric("variance") })
 
 #' @rdname var_pop
 #' @export
 setGeneric("var_pop", function(x) { standardGeneric("var_pop") })
 
+#' @rdname var_samp
+#' @export
+setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
+
 #' @rdname weekofyear
 #' @export
 setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 454be47580b7..54ed646b4ad3 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1040,6 +1040,7 @@ test_that("group by", {
 
   expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6)
   expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2])
+  unlink(jsonPath2)
 })
 
 test_that("arrange() and orderBy() on a DataFrame", {

From 9243fa4b3e508a4be2932da52696db05de7180bf Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Tue, 10 Nov 2015 18:11:06 -0800
Subject: [PATCH 7/8] add tests, update docs

---
 R/pkg/R/functions.R              | 110 ++++++++++++++++++-------------
 R/pkg/inst/tests/test_sparkSQL.R |  31 +++++++--
 2 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 64ab44b1dcfa..0f2090d3b5eb 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -877,12 +877,25 @@ setMethod("rtrim",
             column(jc)
           })
 
-#' @rdname stddev
+#' sd
+#'
+#' Aggregate function: alias for \link{stddev_samp}
+#'
+#' @rdname sd
 #' @name sd
 #' @family agg_funcs
+#' @seealso \link{stddev_pop}, \link{stddev_samp}
+#' @export
+#' @examples
+#'\dontrun{
+#'stddev(df$c)
+#'select(df, stddev(df$age))
+#'agg(df, sd(df$age))
+#'}
 setMethod("sd",
           signature(x = "Column"),
           function(x, na.rm = FALSE) {
+            # In R, sample standard deviation is calculated with the sd() function.
             stddev_samp(x)
           })
 
@@ -1015,15 +1028,8 @@ setMethod("soundex",
             column(jc)
           })
 
-#' stddev
-#'
-#' Aggregate function: alias for \link{stddev_samp}
-#'
-#' @rdname stddev
+#' @rdname sd
 #' @name stddev
-#' @family agg_funcs
-#' @export
-#' @examples \dontrun{stddev(df$c)}
 setMethod("stddev",
           signature(x = "Column"),
           function(x) {
@@ -1031,35 +1037,37 @@ setMethod("stddev",
             column(jc)
           })
 
-#' stddev_samp
+#' stddev_pop
 #'
-#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
+#' Aggregate function: returns the population standard deviation of the expression in a group.
 #'
-#' @rdname stddev_samp
-#' @name stddev_samp
+#' @rdname stddev_pop
+#' @name stddev_pop
 #' @family agg_funcs
+#' @seealso \link{sd}, \link{stddev_samp}
 #' @export
-#' @examples \dontrun{stddev_samp(df$c)}
-setMethod("stddev_samp",
+#' @examples \dontrun{stddev_pop(df$c)}
+setMethod("stddev_pop",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
+            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
             column(jc)
           })
 
-#' stddev_pop
+#' stddev_samp
 #'
-#' Aggregate function: returns the population standard deviation of the expression in a group.
+#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
 #'
-#' @rdname stddev_pop
-#' @name stddev_pop
+#' @rdname stddev_samp
+#' @name stddev_samp
 #' @family agg_funcs
+#' @seealso \link{stddev_pop}, \link{sd}
 #' @export
-#' @examples \dontrun{stddev_pop(df$c)}
-setMethod("stddev_pop",
+#' @examples \dontrun{stddev_samp(df$c)}
+setMethod("stddev_samp",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
+            jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
             column(jc)
           })
 
@@ -1257,24 +1265,30 @@ setMethod("upper",
             column(jc)
           })
 
-#' @family agg_funcs
-#' @rdname variance
+#' var
+#'
+#' Aggregate function: alias for \link{var_samp}.
+#'
+#' @rdname var
 #' @name var
+#' @family agg_funcs
+#' @seealso \link{var_pop}, \link{var_samp}
+#' @export
+#' @examples
+#'\dontrun{
+#'variance(df$c)
+#'select(df, var_pop(df$age))
+#'agg(df, var(df$age))
+#'}
 setMethod("var",
           signature(x = "Column"),
           function(x, y = NULL, na.rm = FALSE, use) {
+            # In R, sample variance is calculated with the var() function.
             var_samp(x)
           })
 
-#' variance
-#'
-#' Aggregate function: alias for \link{var_samp}.
-#'
-#' @rdname variance
+#' @rdname var
 #' @name variance
-#' @family agg_funcs
-#' @export
-#' @examples \dontrun{variance(df$c)}
 setMethod("variance",
           signature(x = "Column"),
           function(x) {
@@ -1282,35 +1296,37 @@ setMethod("variance",
             column(jc)
           })
 
-#' var_samp
+#' var_pop
 #'
-#' Aggregate function: returns the unbiased variance of the values in a group.
+#' Aggregate function: returns the population variance of the values in a group.
 #'
-#' @rdname var_samp
-#' @name var_samp
+#' @rdname var_pop
+#' @name var_pop
 #' @family agg_funcs
+#' @seealso \link{var}, \link{var_samp}
 #' @export
-#' @examples \dontrun{var_samp(df$c)}
-setMethod("var_samp",
+#' @examples \dontrun{var_pop(df$c)}
+setMethod("var_pop",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
+            jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
             column(jc)
           })
 
-#' var_pop
+#' var_samp
 #'
-#' Aggregate function: returns the population variance of the values in a group.
+#' Aggregate function: returns the unbiased variance of the values in a group.
 #'
-#' @rdname var_pop
-#' @name var_pop
+#' @rdname var_samp
+#' @name var_samp
 #' @family agg_funcs
+#' @seealso \link{var_pop}, \link{var}
 #' @export
-#' @examples \dontrun{var_pop(df$c)}
-setMethod("var_pop",
+#' @examples \dontrun{var_samp(df$c)}
+setMethod("var_samp",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
+            jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
             column(jc)
           })
 
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 54ed646b4ad3..dcf777a861f1 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -983,7 +983,7 @@ test_that("when(), otherwise() and ifelse() on a DataFrame", {
   expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
 })
 
-test_that("group by", {
+test_that("group by, agg functions", {
   df <- jsonFile(sqlContext, jsonPath)
   df1 <- agg(df, name = "max", age = "sum")
   expect_equal(1, count(df1))
@@ -1020,8 +1020,8 @@ test_that("group by", {
 
   expect_equal(3, count(mean(gd)))
   expect_equal(3, count(max(gd)))
-  expect_equal(30, collect(max(gd))[1,2])
-  expect_equal(1, collect(count(gd))[1,2])
+  expect_equal(30, collect(max(gd))[1, 2])
+  expect_equal(1, collect(count(gd))[1, 2])
 
   mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
                   "{\"name\":\"ID1\", \"value\": \"10\"}",
@@ -1037,10 +1037,31 @@ test_that("group by", {
 
   df7 <- agg(gd2, value = "stddev")
   df7_local <- collect(df7)
-
   expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6)
   expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2])
+
+  mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
+                  "{\"name\":\"Andy\", \"age\":30}",
+                  "{\"name\":\"Justin\", \"age\":19}",
+                  "{\"name\":\"Justin\", \"age\":1}")
+  jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp")
+  writeLines(mockLines3, jsonPath3)
+  df8 <- jsonFile(sqlContext, jsonPath3)
+  gd3 <- groupBy(df8, "name")
+  gd3_local <- collect(sum(gd3))
+  expect_equal(60, gd3_local[gd3_local$name == "Andy",][1, 2])
+  expect_equal(20, gd3_local[gd3_local$name == "Justin",][1, 2])
+
+  expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
+  gd3_local <- collect(agg(gd3, var(df8$age)))
+  expect_equal(162, gd3_local[gd3_local$name == "Justin",][1, 2])
+
+  # make sure base:: or stats::sd, var are working
+  expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
+  expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
+
   unlink(jsonPath2)
+  unlink(jsonPath3)
 })
 
 test_that("arrange() and orderBy() on a DataFrame", {
@@ -1268,7 +1289,7 @@ test_that("mutate(), transform(), rename() and names()", {
   expect_equal(columns(transformedDF)[4], "newAge2")
   expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30)
 
-  # test if transform on local data frames works
+  # test if base::transform on local data frames works
   # ensure the proper signature is used - otherwise this will fail to run
   attach(airquality)
   result <- transform(Ozone, logOzone = log(Ozone))

From 7498e39bc34449251aabbf1ad2f37b3fb56acf4e Mon Sep 17 00:00:00 2001
From: felixcheung <felixcheung_m@hotmail.com>
Date: Tue, 10 Nov 2015 18:47:04 -0800
Subject: [PATCH 8/8] fix doc which is broken by last minute changes

---
 R/pkg/R/generics.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e7701a942625..758f7394b8dc 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -939,7 +939,7 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
 #' @export
 setGeneric("rtrim", function(x) { standardGeneric("rtrim") })
 
-#' @rdname stddev
+#' @rdname sd
 #' @export
 setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") })
 
@@ -983,7 +983,7 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
 #' @export
 setGeneric("soundex", function(x) { standardGeneric("soundex") })
 
-#' @rdname stddev
+#' @rdname sd
 #' @export
 setGeneric("stddev", function(x) { standardGeneric("stddev") })
 
@@ -1043,11 +1043,11 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta
 #' @export
 setGeneric("upper", function(x) { standardGeneric("upper") })
 
-#' @rdname variance
+#' @rdname var
 #' @export
 setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") })
 
-#' @rdname variance
+#' @rdname var
 #' @export
 setGeneric("variance", function(x) { standardGeneric("variance") })