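Address review comments: rename to findFrequentSequentialPatterns, validate input column type, drop default args and persistence handling, rename output column to "frequency"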

WeichenXu123 · WeichenXu123 · commit acbf9e4d116f · 2018-04-25T16:57:51.000+08:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
@@ -21,8 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.mllib.fpm.{PrefixSpan => mllibPrefixSpan}
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.types.{LongType, StructField, StructType}
-import org.apache.spark.storage.StorageLevel
+import org.apache.spark.sql.types.{ArrayType, LongType, StructField, StructType}
 
 /**
  * :: Experimental ::
@@ -44,26 +43,37 @@ object PrefixSpan {
    *
    * @param dataset A dataset or a dataframe containing a sequence column which is
    *                {{{Seq[Seq[_]]}}} type
-   * @param sequenceCol the name of the sequence column in dataset
+   * @param sequenceCol the name of the sequence column in dataset; rows with nulls in this column
+   *                    are ignored
    * @param minSupport the minimal support level of the sequential pattern, any pattern that
    *                   appears more than (minSupport * size-of-the-dataset) times will be output
-   *                  (default: `0.1`).
-   * @param maxPatternLength the maximal length of the sequential pattern, any pattern that appears
-   *                         less than maxPatternLength will be output (default: `10`).
+   *                  (recommended value: `0.1`).
+   * @param maxPatternLength the maximal length of the sequential pattern
+   *                         (recommended value: `10`).
    * @param maxLocalProjDBSize The maximum number of items (including delimiters used in the
    *                           internal storage format) allowed in a projected database before
    *                           local processing. If a projected database exceeds this size, another
-   *                           iteration of distributed prefix growth is run (default: `32000000`).
-   * @return A dataframe that contains columns of sequence and corresponding frequency.
+   *                           iteration of distributed prefix growth is run
+   *                           (recommended value: `32000000`).
+   * @return A `DataFrame` that contains columns of sequence and corresponding frequency.
+   *         Its schema is:
+   *          - `sequence: Seq[Seq[T]]` (T is the item type)
+   *          - `frequency: Long`
    */
   @Since("2.4.0")
-  def findFrequentSequentPatterns(
+  def findFrequentSequentialPatterns(
       dataset: Dataset[_],
       sequenceCol: String,
-      minSupport: Double = 0.1,
-      maxPatternLength: Int = 10,
-      maxLocalProjDBSize: Long = 32000000L): DataFrame = {
-    val handlePersistence = dataset.storageLevel == StorageLevel.NONE
+      minSupport: Double,
+      maxPatternLength: Int,
+      maxLocalProjDBSize: Long): DataFrame = {
+
+    val inputType = dataset.schema(sequenceCol).dataType
+    require(inputType.isInstanceOf[ArrayType] &&
+      inputType.asInstanceOf[ArrayType].elementType.isInstanceOf[ArrayType],
+      s"The input column must be ArrayType and the array element type must also be ArrayType, " +
+      s"but got $inputType.")
+
 
     val data = dataset.select(sequenceCol)
     val sequences = data.where(col(sequenceCol).isNotNull).rdd
@@ -73,18 +83,13 @@ object PrefixSpan {
       .setMinSupport(minSupport)
       .setMaxPatternLength(maxPatternLength)
       .setMaxLocalProjDBSize(maxLocalProjDBSize)
-    if (handlePersistence) {
-      sequences.persist(StorageLevel.MEMORY_AND_DISK)
-    }
+
     val rows = mllibPrefixSpan.run(sequences).freqSequences.map(f => Row(f.sequence, f.freq))
     val schema = StructType(Seq(
       StructField("sequence", dataset.schema(sequenceCol).dataType, nullable = false),
-      StructField("freq", LongType, nullable = false)))
+      StructField("frequency", LongType, nullable = false)))
     val freqSequences = dataset.sparkSession.createDataFrame(rows, schema)
 
-    if (handlePersistence) {
-      sequences.unpersist()
-    }
     freqSequences
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -49,8 +49,7 @@ import org.apache.spark.storage.StorageLevel
  *
  * @param minSupport the minimal support level of the sequential pattern, any pattern that appears
  *                   more than (minSupport * size-of-the-dataset) times will be output
- * @param maxPatternLength the maximal length of the sequential pattern, any pattern that appears
- *                         less than maxPatternLength will be output
+ * @param maxPatternLength the maximal length of the sequential pattern
  * @param maxLocalProjDBSize The maximum number of items (including delimiters used in the internal
  *                           storage format) allowed in a projected database before local
  *                           processing. If a projected database exceeds this size, another
diff --git a/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala
@@ -25,14 +25,13 @@ class PrefixSpanSuite extends MLTest {
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    smallDataset = Seq(Seq(Seq(1, 2), Seq(1, 2, 3))).toDF("sequence")
   }
 
-  @transient var smallDataset: DataFrame = _
-
   test("PrefixSpan projections with multiple partial starts") {
-    val result = PrefixSpan.findFrequentSequentPatterns(smallDataset, "sequence",
-      minSupport = 1.0, maxPatternLength = 2).as[(Seq[Seq[Int]], Long)].collect()
+    val smallDataset = Seq(Seq(Seq(1, 2), Seq(1, 2, 3))).toDF("sequence")
+    val result = PrefixSpan.findFrequentSequentialPatterns(smallDataset, "sequence",
+      minSupport = 1.0, maxPatternLength = 2, maxLocalProjDBSize = 32000000)
+      .as[(Seq[Seq[Int]], Long)].collect()
     val expected = Array(
       (Seq(Seq(1)), 1L),
       (Seq(Seq(1, 2)), 1L),
@@ -49,6 +48,32 @@ class PrefixSpanSuite extends MLTest {
     compareResults[Int](expected, result)
   }
 
+  /*
+  To verify expected results for `smallTestData`, create file "prefixSpanSeqs2" with content
+  (format = (transactionID, idxInTransaction, numItemsInItemset, itemset)):
+    1 1 2 1 2
+    1 2 1 3
+    2 1 1 1
+    2 2 2 3 2
+    2 3 2 1 2
+    3 1 2 1 2
+    3 2 1 5
+    4 1 1 6
+  In R, run:
+    library("arulesSequences")
+    prefixSpanSeqs = read_baskets("prefixSpanSeqs2", info = c("sequenceID","eventID","SIZE"))
+    freqItemSeq = cspade(prefixSpanSeqs,
+                         parameter = list(support = 0.5, maxlen = 5))
+    resSeq = as(freqItemSeq, "data.frame")
+    resSeq
+
+       sequence support
+    1     <{1}>    0.75
+    2     <{2}>    0.75
+    3     <{3}>    0.50
+    4 <{1},{3}>    0.50
+    5   <{1,2}>    0.75
+ */
   val smallTestData = Seq(
     Seq(Seq(1, 2), Seq(3)),
     Seq(Seq(1), Seq(3, 2), Seq(1, 2)),
@@ -65,8 +90,18 @@ class PrefixSpanSuite extends MLTest {
 
   test("PrefixSpan Integer type, variable-size itemsets") {
     val df = smallTestData.toDF("sequence")
-    val result = PrefixSpan.findFrequentSequentPatterns(df, "sequence",
-      minSupport = 0.5, maxPatternLength = 5).as[(Seq[Seq[Int]], Long)].collect()
+    val result = PrefixSpan.findFrequentSequentialPatterns(df, "sequence",
+      minSupport = 0.5, maxPatternLength = 5, maxLocalProjDBSize = 32000000)
+      .as[(Seq[Seq[Int]], Long)].collect()
+
+    compareResults[Int](smallTestDataExpectedResult, result)
+  }
+
+  test("PrefixSpan input row with nulls") {
+    val df = (smallTestData :+ null).toDF("sequence")
+    val result = PrefixSpan.findFrequentSequentialPatterns(df, "sequence",
+      minSupport = 0.5, maxPatternLength = 5, maxLocalProjDBSize = 32000000)
+      .as[(Seq[Seq[Int]], Long)].collect()
 
     compareResults[Int](smallTestDataExpectedResult, result)
   }
@@ -76,8 +111,9 @@ class PrefixSpanSuite extends MLTest {
     val df = smallTestData
       .map(seq => seq.map(itemSet => itemSet.map(intToString)))
       .toDF("sequence")
-    val result = PrefixSpan.findFrequentSequentPatterns(df, "sequence",
-      minSupport = 0.5, maxPatternLength = 5).as[(Seq[Seq[String]], Long)].collect()
+    val result = PrefixSpan.findFrequentSequentialPatterns(df, "sequence",
+      minSupport = 0.5, maxPatternLength = 5, maxLocalProjDBSize = 32000000)
+      .as[(Seq[Seq[String]], Long)].collect()
 
     val expected = smallTestDataExpectedResult.map { case (seq, freq) =>
       (seq.map(itemSet => itemSet.map(intToString)), freq)