@@ -25,14 +25,13 @@ class PrefixSpanSuite extends MLTest {
2525
2626 override def beforeAll (): Unit = {
2727 super .beforeAll()
28- smallDataset = Seq (Seq (Seq (1 , 2 ), Seq (1 , 2 , 3 ))).toDF(" sequence" )
2928 }
3029
31- @ transient var smallDataset : DataFrame = _
32-
3330 test(" PrefixSpan projections with multiple partial starts" ) {
34- val result = PrefixSpan .findFrequentSequentPatterns(smallDataset, " sequence" ,
35- minSupport = 1.0 , maxPatternLength = 2 ).as[(Seq [Seq [Int ]], Long )].collect()
31+ val smallDataset = Seq (Seq (Seq (1 , 2 ), Seq (1 , 2 , 3 ))).toDF(" sequence" )
32+ val result = PrefixSpan .findFrequentSequentialPatterns(smallDataset, " sequence" ,
33+ minSupport = 1.0 , maxPatternLength = 2 , maxLocalProjDBSize = 32000000 )
34+ .as[(Seq [Seq [Int ]], Long )].collect()
3635 val expected = Array (
3736 (Seq (Seq (1 )), 1L ),
3837 (Seq (Seq (1 , 2 )), 1L ),
@@ -49,6 +48,32 @@ class PrefixSpanSuite extends MLTest {
4948 compareResults[Int ](expected, result)
5049 }
5150
51+ /*
52+ To verify expected results for `smallTestData`, create file "prefixSpanSeqs2" with content
53+ (format = (transactionID, idxInTransaction, numItemsInItemset, itemset)):
54+ 1 1 2 1 2
55+ 1 2 1 3
56+ 2 1 1 1
57+ 2 2 2 3 2
58+ 2 3 2 1 2
59+ 3 1 2 1 2
60+ 3 2 1 5
61+ 4 1 1 6
62+ In R, run:
63+ library("arulesSequences")
64+ prefixSpanSeqs = read_baskets("prefixSpanSeqs2", info = c("sequenceID","eventID","SIZE"))
65+ freqItemSeq = cspade(prefixSpanSeqs,
66+ parameter = list(support = 0.5, maxlen = 5))
67+ resSeq = as(freqItemSeq, "data.frame")
68+ resSeq
69+
70+ sequence support
71+ 1 <{1}> 0.75
72+ 2 <{2}> 0.75
73+ 3 <{3}> 0.50
74+ 4 <{1},{3}> 0.50
75+ 5 <{1,2}> 0.75
76+ */
5277 val smallTestData = Seq (
5378 Seq (Seq (1 , 2 ), Seq (3 )),
5479 Seq (Seq (1 ), Seq (3 , 2 ), Seq (1 , 2 )),
@@ -65,8 +90,18 @@ class PrefixSpanSuite extends MLTest {
6590
6691 test(" PrefixSpan Integer type, variable-size itemsets" ) {
6792 val df = smallTestData.toDF(" sequence" )
68- val result = PrefixSpan .findFrequentSequentPatterns(df, " sequence" ,
69- minSupport = 0.5 , maxPatternLength = 5 ).as[(Seq [Seq [Int ]], Long )].collect()
93+ val result = PrefixSpan .findFrequentSequentialPatterns(df, " sequence" ,
94+ minSupport = 0.5 , maxPatternLength = 5 , maxLocalProjDBSize = 32000000 )
95+ .as[(Seq [Seq [Int ]], Long )].collect()
96+
97+ compareResults[Int ](smallTestDataExpectedResult, result)
98+ }
99+
100+ test(" PrefixSpan input row with nulls" ) {
101+ val df = (smallTestData :+ null ).toDF(" sequence" )
102+ val result = PrefixSpan .findFrequentSequentialPatterns(df, " sequence" ,
103+ minSupport = 0.5 , maxPatternLength = 5 , maxLocalProjDBSize = 32000000 )
104+ .as[(Seq [Seq [Int ]], Long )].collect()
70105
71106 compareResults[Int ](smallTestDataExpectedResult, result)
72107 }
@@ -76,8 +111,9 @@ class PrefixSpanSuite extends MLTest {
76111 val df = smallTestData
77112 .map(seq => seq.map(itemSet => itemSet.map(intToString)))
78113 .toDF(" sequence" )
79- val result = PrefixSpan .findFrequentSequentPatterns(df, " sequence" ,
80- minSupport = 0.5 , maxPatternLength = 5 ).as[(Seq [Seq [String ]], Long )].collect()
114+ val result = PrefixSpan .findFrequentSequentialPatterns(df, " sequence" ,
115+ minSupport = 0.5 , maxPatternLength = 5 , maxLocalProjDBSize = 32000000 )
116+ .as[(Seq [Seq [String ]], Long )].collect()
81117
82118 val expected = smallTestDataExpectedResult.map { case (seq, freq) =>
83119 (seq.map(itemSet => itemSet.map(intToString)), freq)
0 commit comments