Skip to content

Commit 9243fa4

Browse files
committed
add tests, update docs
1 parent 769b553 commit 9243fa4

File tree

2 files changed

+89
-52
lines changed

2 files changed

+89
-52
lines changed

R/pkg/R/functions.R

Lines changed: 63 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -877,12 +877,25 @@ setMethod("rtrim",
877877
column(jc)
878878
})
879879

880-
#' @rdname stddev
880+
#' sd
881+
#'
882+
#' Aggregate function: alias for \link{stddev_samp}.
883+
#'
884+
#' @rdname sd
881885
#' @name sd
882886
#' @family agg_funcs
887+
#' @seealso \link{stddev_pop}, \link{stddev_samp}
888+
#' @export
889+
#' @examples
890+
#'\dontrun{
891+
#'stddev(df$c)
892+
#'select(df, stddev(df$age))
893+
#'agg(df, sd(df$age))
894+
#'}
883895
setMethod("sd",
884896
signature(x = "Column"),
885897
function(x, na.rm = FALSE) {
898+
# stats::sd() computes the sample standard deviation, so delegate to stddev_samp().
# na.rm mirrors the stats::sd() signature but is not used here.
886899
stddev_samp(x)
887900
})
888901

@@ -1015,51 +1028,46 @@ setMethod("soundex",
10151028
column(jc)
10161029
})
10171030

1018-
#' stddev
1019-
#'
1020-
#' Aggregate function: alias for \link{stddev_samp}
1021-
#'
1022-
#' @rdname stddev
1031+
#' @rdname sd
10231032
#' @name stddev
1024-
#' @family agg_funcs
1025-
#' @export
1026-
#' @examples \dontrun{stddev(df$c)}
10271033
setMethod("stddev",
10281034
signature(x = "Column"),
10291035
function(x) {
10301036
jc <- callJStatic("org.apache.spark.sql.functions", "stddev", x@jc)
10311037
column(jc)
10321038
})
10331039

1034-
#' stddev_samp
1040+
#' stddev_pop
10351041
#'
1036-
#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
1042+
#' Aggregate function: returns the population standard deviation of the expression in a group.
10371043
#'
1038-
#' @rdname stddev_samp
1039-
#' @name stddev_samp
1044+
#' @rdname stddev_pop
1045+
#' @name stddev_pop
10401046
#' @family agg_funcs
1047+
#' @seealso \link{sd}, \link{stddev_samp}
10411048
#' @export
1042-
#' @examples \dontrun{stddev_samp(df$c)}
1043-
setMethod("stddev_samp",
1049+
#' @examples \dontrun{stddev_pop(df$c)}
1050+
setMethod("stddev_pop",
10441051
signature(x = "Column"),
10451052
function(x) {
1046-
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
1053+
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
10471054
column(jc)
10481055
})
10491056

1050-
#' stddev_pop
1057+
#' stddev_samp
10511058
#'
1052-
#' Aggregate function: returns the population standard deviation of the expression in a group.
1059+
#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
10531060
#'
1054-
#' @rdname stddev_pop
1055-
#' @name stddev_pop
1061+
#' @rdname stddev_samp
1062+
#' @name stddev_samp
10561063
#' @family agg_funcs
1064+
#' @seealso \link{stddev_pop}, \link{sd}
10571065
#' @export
1058-
#' @examples \dontrun{stddev_pop(df$c)}
1059-
setMethod("stddev_pop",
1066+
#' @examples \dontrun{stddev_samp(df$c)}
1067+
setMethod("stddev_samp",
10601068
signature(x = "Column"),
10611069
function(x) {
1062-
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
1070+
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
10631071
column(jc)
10641072
})
10651073

@@ -1257,60 +1265,68 @@ setMethod("upper",
12571265
column(jc)
12581266
})
12591267

1260-
#' @family agg_funcs
1261-
#' @rdname variance
1268+
#' var
1269+
#'
1270+
#' Aggregate function: alias for \link{var_samp}.
1271+
#'
1272+
#' @rdname var
12621273
#' @name var
1274+
#' @family agg_funcs
1275+
#' @seealso \link{var_pop}, \link{var_samp}
1276+
#' @export
1277+
#' @examples
1278+
#'\dontrun{
1279+
#'variance(df$c)
1280+
#'select(df, variance(df$age))
1281+
#'agg(df, var(df$age))
1282+
#'}
12631283
setMethod("var",
12641284
signature(x = "Column"),
12651285
function(x, y = NULL, na.rm = FALSE, use) {
1286+
# stats::var() computes the sample variance, so delegate to var_samp().
# y, na.rm and use mirror the stats::var() signature but are not used here.
12661287
var_samp(x)
12671288
})
12681289

1269-
#' variance
1270-
#'
1271-
#' Aggregate function: alias for \link{var_samp}.
1272-
#'
1273-
#' @rdname variance
1290+
#' @rdname var
12741291
#' @name variance
1275-
#' @family agg_funcs
1276-
#' @export
1277-
#' @examples \dontrun{variance(df$c)}
12781292
setMethod("variance",
12791293
signature(x = "Column"),
12801294
function(x) {
12811295
jc <- callJStatic("org.apache.spark.sql.functions", "variance", x@jc)
12821296
column(jc)
12831297
})
12841298

1285-
#' var_samp
1299+
#' var_pop
12861300
#'
1287-
#' Aggregate function: returns the unbiased variance of the values in a group.
1301+
#' Aggregate function: returns the population variance of the values in a group.
12881302
#'
1289-
#' @rdname var_samp
1290-
#' @name var_samp
1303+
#' @rdname var_pop
1304+
#' @name var_pop
12911305
#' @family agg_funcs
1306+
#' @seealso \link{var}, \link{var_samp}
12921307
#' @export
1293-
#' @examples \dontrun{var_samp(df$c)}
1294-
setMethod("var_samp",
1308+
#' @examples \dontrun{var_pop(df$c)}
1309+
setMethod("var_pop",
12951310
signature(x = "Column"),
12961311
function(x) {
1297-
jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
1312+
jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
12981313
column(jc)
12991314
})
13001315

1301-
#' var_pop
1316+
#' var_samp
13021317
#'
1303-
#' Aggregate function: returns the population variance of the values in a group.
1318+
#' Aggregate function: returns the unbiased variance of the values in a group.
13041319
#'
1305-
#' @rdname var_pop
1306-
#' @name var_pop
1320+
#' @rdname var_samp
1321+
#' @name var_samp
13071322
#' @family agg_funcs
1323+
#' @seealso \link{var_pop}, \link{var}
13081324
#' @export
1309-
#' @examples \dontrun{var_pop(df$c)}
1310-
setMethod("var_pop",
1325+
#' @examples \dontrun{var_samp(df$c)}
1326+
setMethod("var_samp",
13111327
signature(x = "Column"),
13121328
function(x) {
1313-
jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
1329+
jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
13141330
column(jc)
13151331
})
13161332

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -983,7 +983,7 @@ test_that("when(), otherwise() and ifelse() on a DataFrame", {
983983
expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
984984
})
985985

986-
test_that("group by", {
986+
test_that("group by, agg functions", {
987987
df <- jsonFile(sqlContext, jsonPath)
988988
df1 <- agg(df, name = "max", age = "sum")
989989
expect_equal(1, count(df1))
@@ -1020,8 +1020,8 @@ test_that("group by", {
10201020

10211021
expect_equal(3, count(mean(gd)))
10221022
expect_equal(3, count(max(gd)))
1023-
expect_equal(30, collect(max(gd))[1,2])
1024-
expect_equal(1, collect(count(gd))[1,2])
1023+
expect_equal(30, collect(max(gd))[1, 2])
1024+
expect_equal(1, collect(count(gd))[1, 2])
10251025

10261026
mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
10271027
"{\"name\":\"ID1\", \"value\": \"10\"}",
@@ -1037,10 +1037,31 @@ test_that("group by", {
10371037

10381038
df7 <- agg(gd2, value = "stddev")
10391039
df7_local <- collect(df7)
1040-
10411040
expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6)
10421041
expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2])
1042+
1043+
mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
1044+
"{\"name\":\"Andy\", \"age\":30}",
1045+
"{\"name\":\"Justin\", \"age\":19}",
1046+
"{\"name\":\"Justin\", \"age\":1}")
1047+
jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp")
1048+
writeLines(mockLines3, jsonPath3)
1049+
df8 <- jsonFile(sqlContext, jsonPath3)
1050+
gd3 <- groupBy(df8, "name")
1051+
gd3_local <- collect(sum(gd3))
1052+
expect_equal(60, gd3_local[gd3_local$name == "Andy",][1, 2])
1053+
expect_equal(20, gd3_local[gd3_local$name == "Justin",][1, 2])
1054+
1055+
expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
1056+
gd3_local <- collect(agg(gd3, var(df8$age)))
1057+
expect_equal(162, gd3_local[gd3_local$name == "Justin",][1, 2])
1058+
1059+
# make sure stats::sd and stats::var still work on local (non-Column) data
1060+
expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
1061+
expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
1062+
10431063
unlink(jsonPath2)
1064+
unlink(jsonPath3)
10441065
})
10451066

10461067
test_that("arrange() and orderBy() on a DataFrame", {
@@ -1268,7 +1289,7 @@ test_that("mutate(), transform(), rename() and names()", {
12681289
expect_equal(columns(transformedDF)[4], "newAge2")
12691290
expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30)
12701291

1271-
# test if transform on local data frames works
1292+
# test if base::transform on local data frames works
12721293
# ensure the proper signature is used - otherwise this will fail to run
12731294
attach(airquality)
12741295
result <- transform(Ozone, logOzone = log(Ozone))

0 commit comments

Comments
 (0)