From f9073438d09d1b2ab1517b8762877625e76c1c78 Mon Sep 17 00:00:00 2001 From: krishnakalyan3 Date: Wed, 1 Feb 2017 18:33:02 +0100 Subject: [PATCH 1/5] add bisectingKmeans --- R/pkg/vignettes/sparkr-vignettes.Rmd | 14 ++++++++++++++ docs/ml-clustering.md | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 36a78477dc26..d5e43735a6a4 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -494,6 +494,8 @@ SparkR supports the following machine learning models and algorithms. * Latent Dirichlet Allocation (LDA) +* Bisecting $k$-means + #### Collaborative Filtering * Alternating Least Squares (ALS) @@ -819,6 +821,18 @@ perplexity <- spark.perplexity(model, corpusDF) perplexity ``` +#### Bisecting k-means + +`spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. + +```{r} +df <- createDataFrame(iris) +model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +summary(kmeansModel) +fitted <- predict(model, df) +head(select(fitted, "Sepal_Length", "prediction")) +``` + #### Alternating Least Squares `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index d8b6553c5b84..c526c9772055 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -167,6 +167,13 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/bisecting_k_means_example.py %} + +
+ +Refer to the [R API docs](api/R/spark.bisectingKmeans.html) for more details. {% include_example r/ml/bisectingKmeans.R %} + +{% include_example r/ml/lda.R %} +
## Gaussian Mixture Model (GMM) From 66fd02792687b425ae338cac157ec46aca5fdd63 Mon Sep 17 00:00:00 2001 From: krishnakalyan3 Date: Wed, 1 Feb 2017 20:20:18 +0100 Subject: [PATCH 2/5] fix lda bad commit --- docs/ml-clustering.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index c526c9772055..1186fb73d0fa 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -170,9 +170,9 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.
-Refer to the [R API docs](api/R/spark.bisectingKmeans.html) for more details. {% include_example r/ml/bisectingKmeans.R %} +Refer to the [R API docs](api/R/spark.bisectingKmeans.html) for more details. -{% include_example r/ml/lda.R %} +{% include_example r/ml/bisectingKmeans.R %}
From 73e1c12c83be932f5f1928b2e2964974e921bfb5 Mon Sep 17 00:00:00 2001 From: krishnakalyan3 Date: Wed, 1 Feb 2017 20:23:45 +0100 Subject: [PATCH 3/5] example --- examples/src/main/r/ml/bisectingKmeans.R | 42 ++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 examples/src/main/r/ml/bisectingKmeans.R diff --git a/examples/src/main/r/ml/bisectingKmeans.R b/examples/src/main/r/ml/bisectingKmeans.R new file mode 100644 index 000000000000..8228410ee59f --- /dev/null +++ b/examples/src/main/r/ml/bisectingKmeans.R @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/bisectingKmeans.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-bisectingKmeans-example") + +# $example on$ +irisDF <- suppressWarnings(createDataFrame(iris)) + +# Fit bisecting k-means model with four centers +model <- spark.bisectingKmeans(irisDF, Sepal_Length ~ Sepal_Width, k = 4) + +# get fitted result from a bisecting k-means model +fitted.model <- fitted(model, "centers") + +# Model summary +showDF(fitted.model) + +# fitted values on training data +fitted <- predict(model, irisDF) +head(select(fitted, "Sepal_Length", "prediction")) +# $example off$ From 240c9893989914424bbd2cf1cc01cf95daa95e27 Mon Sep 17 00:00:00 2001 From: krishnakalyan3 Date: Wed, 1 Feb 2017 20:29:10 +0100 Subject: [PATCH 4/5] fix supress and summary --- examples/src/main/r/ml/bisectingKmeans.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/r/ml/bisectingKmeans.R b/examples/src/main/r/ml/bisectingKmeans.R index 8228410ee59f..37aeb74fc761 100644 --- a/examples/src/main/r/ml/bisectingKmeans.R +++ b/examples/src/main/r/ml/bisectingKmeans.R @@ -25,7 +25,7 @@ library(SparkR) sparkR.session(appName = "SparkR-ML-bisectingKmeans-example") # $example on$ -irisDF <- suppressWarnings(createDataFrame(iris)) +irisDF <- createDataFrame(iris) # Fit bisecting k-means model with four centers model <- spark.bisectingKmeans(irisDF, Sepal_Length ~ Sepal_Width, k = 4) @@ -34,7 +34,7 @@ model <- spark.bisectingKmeans(irisDF, Sepal_Length ~ Sepal_Width, k = 4) fitted.model <- fitted(model, "centers") # Model summary -showDF(fitted.model) +head(summary(fitted.model)) # fitted values on training data fitted <- predict(model, irisDF) From 0cfb8914d8082f3311a3427dbe44001a1503b168 Mon Sep 17 00:00:00 2001 From: krishnakalyan3 Date: Thu, 2 Feb 2017 12:18:06 +0100 Subject: [PATCH 5/5] fix orider --- R/pkg/vignettes/sparkr-vignettes.Rmd | 28 
++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index d5e43735a6a4..a7cac2f503d1 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -488,14 +488,14 @@ SparkR supports the following machine learning models and algorithms. #### Clustering +* Bisecting $k$-means + * Gaussian Mixture Model (GMM) * $k$-means Clustering * Latent Dirichlet Allocation (LDA) -* Bisecting $k$-means - #### Collaborative Filtering * Alternating Least Squares (ALS) @@ -740,6 +740,18 @@ summary(rfModel) predictions <- predict(rfModel, df) ``` +#### Bisecting k-Means + +`spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. + +```{r} +df <- createDataFrame(iris) +model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +summary(model) +fitted <- predict(model, df) +head(select(fitted, "Sepal_Length", "prediction")) +``` + #### Gaussian Mixture Model `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. @@ -821,18 +833,6 @@ perplexity <- spark.perplexity(model, corpusDF) perplexity ``` -#### Bisecting k-means - -`spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. 
- -```{r} -df <- createDataFrame(iris) -model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) -summary(kmeansModel) -fitted <- predict(model, df) -head(select(fitted, "Sepal_Length", "prediction")) -``` - #### Alternating Least Squares `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).