From 680b4759e04c7d371fc555e220730e0a4da251f5 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 4 Nov 2015 18:37:04 -0800 Subject: [PATCH 1/8] Add stddev and friends --- R/pkg/R/group.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 4cab1a69f601..f0b158997944 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -118,7 +118,9 @@ setMethod("summarize", }) # sum/mean/avg/min/max -methods <- c("sum", "mean", "avg", "min", "max") +methods <- c("sum", "mean", "avg", "min", "max", + "stddev", "stddev_samp", "stddev_pop", + "variance", "var_samp", "var_pop", "skewness", "kurtosis") createMethod <- function(name) { setMethod(name, From f63608e3d36bae0544281a94c732760bfebfcc6d Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 4 Nov 2015 21:05:57 -0800 Subject: [PATCH 2/8] add aggFunction by name and on Column --- R/pkg/R/functions.R | 152 ++++++++++++++++++++++++++++++++++++++++---- R/pkg/R/generics.R | 32 ++++++++++ R/pkg/R/group.R | 14 ++-- 3 files changed, 177 insertions(+), 21 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index d7fd27927913..c082ba423ad4 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -520,6 +520,22 @@ setMethod("isNaN", column(jc) }) +#' kurtosis +#' +#' Aggregate function: returns the kurtosis of the values in a group. +#' +#' @rdname kurtosis +#' @name kurtosis +#' @family agg_funcs +#' @export +#' @examples \dontrun{kurtosis(df$c)} +setMethod("kurtosis", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "kurtosis", x@jc) + column(jc) + }) + #' last #' #' Aggregate function: returns the last value in a group. @@ -958,6 +974,22 @@ setMethod("size", column(jc) }) +#' skewness +#' +#' Aggregate function: returns the skewness of the values in a group. +#' +#' @rdname skewness +#' @name skewness +#' @family agg_funcs +#' @export +#' @examples \dontrun{skewness(df$c)} +setMethod("skewness", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "skewness", x@jc) + column(jc) + }) + #' soundex #' #' Return the soundex code for the specified expression. @@ -974,6 +1006,54 @@ setMethod("soundex", column(jc) }) +#' var_samp +#' +#' Aggregate function: alias for \link{stddev_samp} +#' +#' @rdname stddev +#' @name stddev +#' @family agg_funcs +#' @export +#' @examples \dontrun{stddev(df$c)} +setMethod("stddev", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "stddev", x@jc) + column(jc) + }) + +#' stddev_samp +#' +#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group. +#' +#' @rdname stddev_samp +#' @name stddev_samp +#' @family agg_funcs +#' @export +#' @examples \dontrun{stddev_samp(df$c)} +setMethod("stddev_samp", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc) + column(jc) + }) + +#' stddev_pop +#' +#' Aggregate function: returns the population standard deviation of the expression in a group. +#' +#' @rdname stddev_pop +#' @name stddev_pop +#' @family agg_funcs +#' @export +#' @examples \dontrun{stddev_pop(df$c)} +setMethod("stddev_pop", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc) + column(jc) + }) + #' sqrt #' #' Computes the square root of the specified float value. @@ -1168,6 +1248,54 @@ setMethod("upper", column(jc) }) +#' variance +#' +#' Aggregate function: alias for \link{var_samp}. +#' +#' @rdname variance +#' @name variance +#' @family agg_funcs +#' @export +#' @examples \dontrun{variance(df$c)} +setMethod("variance", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "variance", x@jc) + column(jc) + }) + +#' var_samp +#' +#' Aggregate function: returns the unbiased variance of the values in a group. +#' +#' @rdname var_samp +#' @name var_samp +#' @family agg_funcs +#' @export +#' @examples \dontrun{var_samp(df$c)} +setMethod("var_samp", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc) + column(jc) + }) + +#' var_pop +#' +#' Aggregate function: returns the population variance of the values in a group. +#' +#' @rdname var_pop +#' @name var_pop +#' @family agg_funcs +#' @export +#' @examples \dontrun{var_pop(df$c)} +setMethod("var_pop", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc) + column(jc) + }) + #' weekofyear #' #' Extracts the week number as an integer from a given date/timestamp/string. @@ -2020,10 +2148,10 @@ setMethod("ifelse", #' #' Window function: returns the cumulative distribution of values within a window partition, #' i.e. the fraction of rows that are below the current row. -#' +#' #' N = total number of rows in the partition #' cumeDist(x) = number of values before (and including) x / N -#' +#' #' This is equivalent to the CUME_DIST function in SQL. #' #' @rdname cumeDist @@ -2039,13 +2167,13 @@ setMethod("cumeDist", }) #' denseRank -#' +#' #' Window function: returns the rank of rows within a window partition, without any gaps. #' The difference between rank and denseRank is that denseRank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using denseRank #' and had three people tie for second place, you would say that all three were in second #' place and that the next person came in third. -#' +#' #' This is equivalent to the DENSE_RANK function in SQL. #' #' @rdname denseRank @@ -2065,7 +2193,7 @@ setMethod("denseRank", #' Window function: returns the value that is `offset` rows before the current row, and #' `defaultValue` if there is less than `offset` rows before the current row. For example, #' an `offset` of one will return the previous row at any given point in the window partition. -#' +#' #' This is equivalent to the LAG function in SQL. #' #' @rdname lag @@ -2092,7 +2220,7 @@ setMethod("lag", #' Window function: returns the value that is `offset` rows after the current row, and #' `null` if there is less than `offset` rows after the current row. For example, #' an `offset` of one will return the next row at any given point in the window partition. -#' +#' #' This is equivalent to the LEAD function in SQL. #' #' @rdname lead @@ -2119,7 +2247,7 @@ setMethod("lead", #' Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window #' partition. Fow example, if `n` is 4, the first quarter of the rows will get value 1, the second #' quarter will get 2, the third quarter will get 3, and the last quarter will get 4. -#' +#' #' This is equivalent to the NTILE function in SQL. #' #' @rdname ntile @@ -2137,9 +2265,9 @@ setMethod("ntile", #' percentRank #' #' Window function: returns the relative rank (i.e. percentile) of rows within a window partition. -#' +#' #' This is computed by: -#' +#' #' (rank of row in its partition - 1) / (number of rows in the partition - 1) #' #' This is equivalent to the PERCENT_RANK function in SQL. @@ -2159,12 +2287,12 @@ setMethod("percentRank", #' rank #' #' Window function: returns the rank of rows within a window partition. -#' +#' #' The difference between rank and denseRank is that denseRank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using denseRank #' and had three people tie for second place, you would say that all three were in second #' place and that the next person came in third. -#' +#' #' This is equivalent to the RANK function in SQL. #' #' @rdname rank @@ -2189,7 +2317,7 @@ setMethod("rank", #' rowNumber #' #' Window function: returns a sequential number starting at 1 within a window partition. -#' +#' #' This is equivalent to the ROW_NUMBER function in SQL. #' #' @rdname rowNumber diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0b35340e48e4..bcd5883975ff 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -798,6 +798,10 @@ setGeneric("instr", function(y, x) { standardGeneric("instr") }) #' @export setGeneric("isNaN", function(x) { standardGeneric("isNaN") }) +#' @rdname kurtosis +#' @export +setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") }) + #' @rdname lag #' @export setGeneric("lag", function(x, offset, defaultValue = NULL) { standardGeneric("lag") }) @@ -967,10 +971,26 @@ setGeneric("signum", function(x) { standardGeneric("signum") }) #' @export setGeneric("size", function(x) { standardGeneric("size") }) +#' @rdname skewness +#' @export +setGeneric("skewness", function(x) { standardGeneric("skewness") }) + #' @rdname soundex #' @export setGeneric("soundex", function(x) { standardGeneric("soundex") }) +#' @rdname stddev +#' @export +setGeneric("stddev", function(x) { standardGeneric("stddev") }) + +#' @rdname stddev_samp +#' @export +setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) + +#' @rdname stddev_pop +#' @export +setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") }) + #' @rdname substring_index #' @export setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) @@ -1019,6 +1039,18 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta #' @export setGeneric("upper", function(x) { standardGeneric("upper") }) +#' @rdname variance +#' @export +setGeneric("variance", function(x) { standardGeneric("variance") }) + +#' @rdname var_samp +#' @export +setGeneric("var_samp", function(x) { standardGeneric("var_samp") }) + +#' @rdname var_pop +#' @export +setGeneric("var_pop", function(x) { standardGeneric("var_pop") }) + #' @rdname weekofyear #' @export setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index f0b158997944..cc788128317b 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -62,11 +62,6 @@ setMethod("show", "GroupedData", #' \dontrun{ #' count(groupBy(df, "name")) #' } -setMethod("count", - signature(x = "GroupedData"), - function(x) { - dataFrame(callJMethod(x@sgd, "count")) - }) #' Agg #' @@ -79,6 +74,7 @@ setMethod("count", #' @param x a GroupedData #' @return a DataFrame #' @rdname agg +#' @family agg_funcs #' @examples #' \dontrun{ #' df2 <- agg(df, age = "sum") # new column name will be created as 'SUM(age#0)' @@ -117,10 +113,10 @@ setMethod("summarize", agg(x, ...) }) -# sum/mean/avg/min/max -methods <- c("sum", "mean", "avg", "min", "max", - "stddev", "stddev_samp", "stddev_pop", - "variance", "var_samp", "var_pop", "skewness", "kurtosis") +# Aggregate Functions by name +methods <- c("approxCountDistinct", "avg", "corr", "count", "countDistinct", "first", "kurtosis", + "last", "max", "mean", "min", "skewness", "stddev", "stddev_samp", "stddev_pop", + "sum", "sumDistinct", "variance", "var_samp", "var_pop") createMethod <- function(name) { setMethod(name, From e0fda371d3691a21672a94e9e0b383890cc745a6 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Thu, 5 Nov 2015 00:27:15 -0800 Subject: [PATCH 3/8] Add tests, checked supported functions for GroupedData --- R/pkg/NAMESPACE | 8 +++++ R/pkg/R/group.R | 12 +++++-- R/pkg/inst/tests/test_sparkSQL.R | 57 ++++++++++++++++++++++++-------- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index cd9537a2655f..0a0a46b187aa 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -152,6 +152,7 @@ exportMethods("%in%", "isNaN", "isNotNull", "isNull", + "kurtosis", "lag", "last", "last_day", @@ -209,7 +210,11 @@ exportMethods("%in%", "sin", "sinh", "size", + "skewness", "soundex", + "stddev", + "stddev_samp", + "stddev_pop", "sqrt", "startsWith", "substr", @@ -228,6 +233,9 @@ exportMethods("%in%", "unhex", "unix_timestamp", "upper", + "variance", + "var_samp", + "var_pop", "weekofyear", "when", "year") diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index cc788128317b..e5f702faee65 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -62,6 +62,11 @@ setMethod("show", "GroupedData", #' \dontrun{ #' count(groupBy(df, "name")) #' } +setMethod("count", + signature(x = "GroupedData"), + function(x) { + dataFrame(callJMethod(x@sgd, "count")) + }) #' Agg #' @@ -114,9 +119,10 @@ setMethod("summarize", }) # Aggregate Functions by name -methods <- c("approxCountDistinct", "avg", "corr", "count", "countDistinct", "first", "kurtosis", - "last", "max", "mean", "min", "skewness", "stddev", "stddev_samp", "stddev_pop", - "sum", "sumDistinct", "variance", "var_samp", "var_pop") +methods <- c("avg", "max", "mean", "min", "sum") + +# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", "stddev_pop", +# "variance", "var_samp", "var_pop" createMethod <- function(name) { setMethod(name, diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index b4a4d03b2643..0fc82187d523 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -826,12 +826,13 @@ test_that("column functions", { c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c) c7 <- mean(c) + min(c) + month(c) + negate(c) + quarter(c) c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) - c9 <- signum(c) + sin(c) + sinh(c) + size(c) + soundex(c) + sqrt(c) + sum(c) + c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c) c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c) c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) - c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1) - c13 <- cumeDist() + ntile(1) - c14 <- denseRank() + percentRank() + rank() + rowNumber() + c12 <- variance(c) + c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1) + c14 <- cumeDist() + ntile(1) + c15 <- denseRank() + percentRank() + rank() + rowNumber() # Test if base::rank() is exposed expect_equal(class(rank())[[1]], "Column") @@ -849,6 +850,12 @@ test_that("column functions", { expect_equal(collect(df3)[[2, 1]], FALSE) expect_equal(collect(df3)[[3, 1]], TRUE) + expect_equal(collect(select(df, sum(df$age)))[1, 1], 49) + + expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6) + + expect_equal(collect(select(df, variance(df$age)))[1, 1], 30.25) + df4 <- createDataFrame(sqlContext, list(list(a = "010101"))) expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15") }) @@ -997,20 +1004,42 @@ test_that("group by", { expect_is(df_summarized, "DataFrame") expect_equal(3, count(df_summarized)) - df3 <- agg(gd, age = "sum") + df3 <- agg(gd, age = "stddev") expect_is(df3, "DataFrame") - expect_equal(3, count(df3)) + df3_local <- collect(df3) + expect_equal(0, df3_local[df3_local$name == "Andy",][1, 2]) - df3 <- agg(gd, age = sum(df$age)) - expect_is(df3, "DataFrame") - expect_equal(3, count(df3)) - expect_equal(columns(df3), c("name", "age")) - - df4 <- sum(gd, "age") + df4 <- agg(gd, sumAge = sum(df$age)) expect_is(df4, "DataFrame") expect_equal(3, count(df4)) - expect_equal(3, count(mean(gd, "age"))) - expect_equal(3, count(max(gd, "age"))) + expect_equal(columns(df4), c("name", "sumAge")) + + df5 <- sum(gd, "age") + expect_is(df5, "DataFrame") + expect_equal(3, count(df5)) + + expect_equal(3, count(mean(gd))) + expect_equal(3, count(max(gd))) + expect_equal(30, collect(max(gd))[1,2]) + expect_equal(1, collect(count(gd))[1,2]) + + mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}", + "{\"name\":\"ID1\", \"value\": \"10\"}", + "{\"name\":\"ID1\", \"value\": \"22\"}", + "{\"name\":\"ID2\", \"value\": \"-3\"}") + jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + writeLines(mockLines2, jsonPath2) + gd2 <- groupBy(jsonFile(sqlContext, jsonPath2), "name") + df6 <- agg(gd2, value = "sum") + df6_local <- collect(df6) + expect_equal(42, df6_local[df6_local$name == "ID1",][1, 2]) + expect_equal(-3, df6_local[df6_local$name == "ID2",][1, 2]) + + df7 <- agg(gd2, value = "stddev") + df7_local <- collect(df7) + + expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6) + expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2]) }) test_that("arrange() and orderBy() on a DataFrame", { From 58d0fd91843215cf05cd524bcfd75b9922fd59b8 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Thu, 5 Nov 2015 00:34:27 -0800 Subject: [PATCH 4/8] fix bug in rdoc --- R/pkg/R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index c082ba423ad4..41cdf7544e04 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1006,7 +1006,7 @@ setMethod("soundex", column(jc) }) -#' var_samp +#' stddev #' #' Aggregate function: alias for \link{stddev_samp} #' From 7a2a90468623814c8bfc46c22a5058b350951a64 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Thu, 5 Nov 2015 13:37:21 -0800 Subject: [PATCH 5/8] test broken by variance mapping changed var_amp, make this fix at var_pop --- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 0fc82187d523..454be47580b7 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -854,7 +854,7 @@ test_that("column functions", { expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6) - expect_equal(collect(select(df, variance(df$age)))[1, 1], 30.25) + expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25) df4 <- createDataFrame(sqlContext, list(list(a = "010101"))) expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15") From 769b553ceee5d967eb9057d56fbc2f5f73de6b08 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 10 Nov 2015 15:53:04 -0800 Subject: [PATCH 6/8] add alias, fixes from feedback --- R/pkg/NAMESPACE | 6 ++++-- R/pkg/R/functions.R | 18 ++++++++++++++++++ R/pkg/R/generics.R | 22 +++++++++++++++------- R/pkg/inst/tests/test_sparkSQL.R | 1 + 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 0a0a46b187aa..6b304587b077 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -205,6 +205,7 @@ exportMethods("%in%", "shiftLeft", "shiftRight", "shiftRightUnsigned", + "sd", "sign", "signum", "sin", @@ -213,8 +214,8 @@ exportMethods("%in%", "skewness", "soundex", "stddev", - "stddev_samp", "stddev_pop", + "stddev_samp", "sqrt", "startsWith", "substr", @@ -233,9 +234,10 @@ exportMethods("%in%", "unhex", "unix_timestamp", "upper", + "var", "variance", - "var_samp", "var_pop", + "var_samp", "weekofyear", "when", "year") diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 41cdf7544e04..64ab44b1dcfa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -877,6 +877,15 @@ setMethod("rtrim", column(jc) }) +#' @rdname stddev +#' @name sd +#' @family agg_funcs +setMethod("sd", + signature(x = "Column"), + function(x, na.rm = FALSE) { + stddev_samp(x) + }) + #' second #' #' Extracts the seconds as an integer from a given date/timestamp/string. @@ -1248,6 +1257,15 @@ setMethod("upper", column(jc) }) +#' @family agg_funcs +#' @rdname variance +#' @name var +setMethod("var", + signature(x = "Column"), + function(x, y = NULL, na.rm = FALSE, use) { + var_samp(x) + }) + #' variance #' #' Aggregate function: alias for \link{var_samp}. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index bcd5883975ff..e7701a942625 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -939,6 +939,10 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) #' @export setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) +#' @rdname stddev +#' @export +setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") }) + #' @rdname second #' @export setGeneric("second", function(x) { standardGeneric("second") }) @@ -983,14 +987,14 @@ setGeneric("soundex", function(x) { standardGeneric("soundex") }) #' @export setGeneric("stddev", function(x) { standardGeneric("stddev") }) -#' @rdname stddev_samp -#' @export -setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) - #' @rdname stddev_pop #' @export setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") }) +#' @rdname stddev_samp +#' @export +setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) + #' @rdname substring_index #' @export setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) @@ -1041,16 +1045,20 @@ setGeneric("upper", function(x) { standardGeneric("upper") }) #' @rdname variance #' @export -setGeneric("variance", function(x) { standardGeneric("variance") }) +setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") }) -#' @rdname var_samp +#' @rdname variance #' @export -setGeneric("var_samp", function(x) { standardGeneric("var_samp") }) +setGeneric("variance", function(x) { standardGeneric("variance") }) #' @rdname var_pop #' @export setGeneric("var_pop", function(x) { standardGeneric("var_pop") }) +#' @rdname var_samp +#' @export +setGeneric("var_samp", function(x) { standardGeneric("var_samp") }) + #' @rdname weekofyear #' @export setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 454be47580b7..54ed646b4ad3 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1040,6 +1040,7 @@ test_that("group by", { expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6) expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2]) + unlink(jsonPath2) }) test_that("arrange() and orderBy() on a DataFrame", { From 9243fa4b3e508a4be2932da52696db05de7180bf Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 10 Nov 2015 18:11:06 -0800 Subject: [PATCH 7/8] add tests, update docs --- R/pkg/R/functions.R | 110 ++++++++++++++++++------------- R/pkg/inst/tests/test_sparkSQL.R | 31 +++++++-- 2 files changed, 89 insertions(+), 52 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 64ab44b1dcfa..0f2090d3b5eb 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -877,12 +877,25 @@ setMethod("rtrim", column(jc) }) -#' @rdname stddev +#' sd +#' +#' Aggregate function: alias for \link{stddev_samp} +#' +#' @rdname sd #' @name sd #' @family agg_funcs +#' @seealso \link{stddev_pop}, \link{stddev_samp} +#' @export +#' @examples +#'\dontrun{ +#'stddev(df$c) +#'select(df, stddev(df$age)) +#'agg(df, sd(df$age)) +#'} setMethod("sd", signature(x = "Column"), function(x, na.rm = FALSE) { + # In R, sample standard deviation is calculated with the sd() function. stddev_samp(x) }) @@ -1015,15 +1028,8 @@ setMethod("soundex", column(jc) }) -#' stddev -#' -#' Aggregate function: alias for \link{stddev_samp} -#' -#' @rdname stddev +#' @rdname sd #' @name stddev -#' @family agg_funcs -#' @export -#' @examples \dontrun{stddev(df$c)} setMethod("stddev", signature(x = "Column"), function(x) { @@ -1031,35 +1037,37 @@ setMethod("stddev", column(jc) }) -#' stddev_samp +#' stddev_pop #' -#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group. +#' Aggregate function: returns the population standard deviation of the expression in a group. #' -#' @rdname stddev_samp -#' @name stddev_samp +#' @rdname stddev_pop +#' @name stddev_pop #' @family agg_funcs +#' @seealso \link{sd}, \link{stddev_samp} #' @export -#' @examples \dontrun{stddev_samp(df$c)} -setMethod("stddev_samp", +#' @examples \dontrun{stddev_pop(df$c)} +setMethod("stddev_pop", signature(x = "Column"), function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc) + jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc) column(jc) }) -#' stddev_pop +#' stddev_samp #' -#' Aggregate function: returns the population standard deviation of the expression in a group. +#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group. #' -#' @rdname stddev_pop -#' @name stddev_pop +#' @rdname stddev_samp +#' @name stddev_samp #' @family agg_funcs +#' @seealso \link{stddev_pop}, \link{sd} #' @export -#' @examples \dontrun{stddev_pop(df$c)} -setMethod("stddev_pop", +#' @examples \dontrun{stddev_samp(df$c)} +setMethod("stddev_samp", signature(x = "Column"), function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc) + jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc) column(jc) }) @@ -1257,24 +1265,30 @@ setMethod("upper", column(jc) }) -#' @family agg_funcs -#' @rdname variance +#' var +#' +#' Aggregate function: alias for \link{var_samp}. +#' +#' @rdname var #' @name var +#' @family agg_funcs +#' @seealso \link{var_pop}, \link{var_samp} +#' @export +#' @examples +#'\dontrun{ +#'variance(df$c) +#'select(df, var_pop(df$age)) +#'agg(df, var(df$age)) +#'} setMethod("var", signature(x = "Column"), function(x, y = NULL, na.rm = FALSE, use) { + # In R, sample variance is calculated with the var() function. var_samp(x) }) -#' variance -#' -#' Aggregate function: alias for \link{var_samp}. -#' -#' @rdname variance +#' @rdname var #' @name variance -#' @family agg_funcs -#' @export -#' @examples \dontrun{variance(df$c)} setMethod("variance", signature(x = "Column"), function(x) { @@ -1282,35 +1296,37 @@ setMethod("variance", column(jc) }) -#' var_samp +#' var_pop #' -#' Aggregate function: returns the unbiased variance of the values in a group. +#' Aggregate function: returns the population variance of the values in a group. #' -#' @rdname var_samp -#' @name var_samp +#' @rdname var_pop +#' @name var_pop #' @family agg_funcs +#' @seealso \link{var}, \link{var_samp} #' @export -#' @examples \dontrun{var_samp(df$c)} -setMethod("var_samp", +#' @examples \dontrun{var_pop(df$c)} +setMethod("var_pop", signature(x = "Column"), function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc) + jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc) column(jc) }) -#' var_pop +#' var_samp #' -#' Aggregate function: returns the population variance of the values in a group. +#' Aggregate function: returns the unbiased variance of the values in a group. #' -#' @rdname var_pop -#' @name var_pop +#' @rdname var_samp +#' @name var_samp #' @family agg_funcs +#' @seealso \link{var_pop}, \link{var} #' @export -#' @examples \dontrun{var_pop(df$c)} -setMethod("var_pop", +#' @examples \dontrun{var_samp(df$c)} +setMethod("var_samp", signature(x = "Column"), function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc) + jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc) column(jc) }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 54ed646b4ad3..dcf777a861f1 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -983,7 +983,7 @@ test_that("when(), otherwise() and ifelse() on a DataFrame", { expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0)) }) -test_that("group by", { +test_that("group by, agg functions", { df <- jsonFile(sqlContext, jsonPath) df1 <- agg(df, name = "max", age = "sum") expect_equal(1, count(df1)) @@ -1020,8 +1020,8 @@ test_that("group by", { expect_equal(3, count(mean(gd))) expect_equal(3, count(max(gd))) - expect_equal(30, collect(max(gd))[1,2]) - expect_equal(1, collect(count(gd))[1,2]) + expect_equal(30, collect(max(gd))[1, 2]) + expect_equal(1, collect(count(gd))[1, 2]) mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}", "{\"name\":\"ID1\", \"value\": \"10\"}", @@ -1037,10 +1037,31 @@ test_that("group by", { df7 <- agg(gd2, value = "stddev") df7_local <- collect(df7) - expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6) expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2]) + + mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Justin\", \"age\":19}", + "{\"name\":\"Justin\", \"age\":1}") + jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp") + writeLines(mockLines3, jsonPath3) + df8 <- jsonFile(sqlContext, jsonPath3) + gd3 <- groupBy(df8, "name") + gd3_local <- collect(sum(gd3)) + expect_equal(60, gd3_local[gd3_local$name == "Andy",][1, 2]) + expect_equal(20, gd3_local[gd3_local$name == "Justin",][1, 2]) + + expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6) + gd3_local <- collect(agg(gd3, var(df8$age))) + expect_equal(162, gd3_local[gd3_local$name == "Justin",][1, 2]) + + # make sure base:: or stats::sd, var are working + expect_true(abs(sd(1:2) - 0.7071068) < 1e-6) + expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6) + unlink(jsonPath2) + unlink(jsonPath3) }) test_that("arrange() and orderBy() on a DataFrame", { @@ -1268,7 +1289,7 @@ test_that("mutate(), transform(), rename() and names()", { expect_equal(columns(transformedDF)[4], "newAge2") expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30) - # test if transform on local data frames works + # test if base::transform on local data frames works # ensure the proper signature is used - otherwise this will fail to run attach(airquality) result <- transform(Ozone, logOzone = log(Ozone)) From 7498e39bc34449251aabbf1ad2f37b3fb56acf4e Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 10 Nov 2015 18:47:04 -0800 Subject: [PATCH 8/8] fix doc which is broken by last minute changes --- R/pkg/R/generics.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e7701a942625..758f7394b8dc 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -939,7 +939,7 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) #' @export setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) -#' @rdname stddev +#' @rdname sd #' @export setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") }) @@ -983,7 +983,7 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") }) #' @export setGeneric("soundex", function(x) { standardGeneric("soundex") }) -#' @rdname stddev +#' @rdname sd #' @export setGeneric("stddev", function(x) { standardGeneric("stddev") }) @@ -1043,11 +1043,11 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta #' @export setGeneric("upper", function(x) { standardGeneric("upper") }) -#' @rdname variance +#' @rdname var #' @export setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") }) -#' @rdname variance +#' @rdname var #' @export setGeneric("variance", function(x) { standardGeneric("variance") })