11
11
// ===----------------------------------------------------------------------===//
12
12
13
13
#include " llvm/Support/SuffixTree.h"
14
+ #include " llvm/ADT/SmallPtrSet.h"
14
15
#include " llvm/Support/Allocator.h"
15
16
#include " llvm/Support/Casting.h"
16
17
#include " llvm/Support/SuffixTreeNode.h"
18
+ #include < stack>
17
19
18
20
using namespace llvm ;
19
21
@@ -26,7 +28,9 @@ static size_t numElementsInSubstring(const SuffixTreeNode *N) {
26
28
return N->getEndIdx () - N->getStartIdx () + 1 ;
27
29
}
28
30
29
- SuffixTree::SuffixTree (const ArrayRef<unsigned > &Str) : Str(Str) {
31
+ SuffixTree::SuffixTree (const ArrayRef<unsigned > &Str,
32
+ bool OutlinerLeafDescendants)
33
+ : Str(Str), OutlinerLeafDescendants(OutlinerLeafDescendants) {
30
34
Root = insertRoot ();
31
35
Active.Node = Root;
32
36
@@ -46,6 +50,11 @@ SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
46
50
// Set the suffix indices of each leaf.
47
51
assert (Root && " Root node can't be nullptr!" );
48
52
setSuffixIndices ();
53
+
54
+ // Collect all leaf nodes of the suffix tree and, for each internal node,
55
+ // record the range of leaf nodes that are descendants of it.
56
+ if (OutlinerLeafDescendants)
57
+ setLeafNodes ();
49
58
}
50
59
51
60
SuffixTreeNode *SuffixTree::insertLeaf (SuffixTreeInternalNode &Parent,
@@ -105,6 +114,68 @@ void SuffixTree::setSuffixIndices() {
105
114
}
106
115
}
107
116
117
+ void SuffixTree::setLeafNodes () {
118
+ // A stack that keeps track of nodes to visit for post-order DFS traversal.
119
+ std::stack<SuffixTreeNode *> ToVisit;
120
+ ToVisit.push (Root);
121
+
122
+ // This keeps track of the index of the next leaf node to be added to
123
+ // the LeafNodes vector of the suffix tree.
124
+ unsigned LeafCounter = 0 ;
125
+
126
+ // This keeps track of nodes whose children have been added to the stack
127
+ // during the post-order depth-first traversal of the tree.
128
+ llvm::SmallPtrSet<SuffixTreeInternalNode *, 32 > ChildrenAddedToStack;
129
+
130
+ // Traverse the tree in post-order.
131
+ while (!ToVisit.empty ()) {
132
+ SuffixTreeNode *CurrNode = ToVisit.top ();
133
+ ToVisit.pop ();
134
+ if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
135
+ // The current node is an internal node.
136
+ if (ChildrenAddedToStack.find (CurrInternalNode) !=
137
+ ChildrenAddedToStack.end ()) {
138
+ // If the children of the current node have been added to the stack,
139
+ // then this is the second time we visit this node and at this point,
140
+ // all of its children have already been processed. Now, we can
141
+ // set its LeftLeafIdx and RightLeafIdx.
142
+ auto it = CurrInternalNode->Children .begin ();
143
+ if (it != CurrInternalNode->Children .end ()) {
144
+ // Get the first child to use its RightLeafIdx. The RightLeafIdx is
145
+ // used as the first child is the initial one added to the stack, so
146
+ // it's the last one to be processed. This implies that the leaf
147
+ // descendants of the first child are assigned the largest index
148
+ // numbers.
149
+ CurrNode->setRightLeafIdx (it->second ->getRightLeafIdx ());
150
+ // Get the last child to use its LeftLeafIdx.
151
+ while (std::next (it) != CurrInternalNode->Children .end ())
152
+ it = std::next (it);
153
+ CurrNode->setLeftLeafIdx (it->second ->getLeftLeafIdx ());
154
+ assert (CurrNode->getLeftLeafIdx () <= CurrNode->getRightLeafIdx () &&
155
+ " LeftLeafIdx should not be larger than RightLeafIdx" );
156
+ }
157
+ } else {
158
+ // This is the first time we visit this node. This means that its
159
+ // children have not been added to the stack yet. Hence, we will add
160
+ // the current node back to the stack and add its children to the
161
+ // stack for processing.
162
+ ToVisit.push (CurrNode);
163
+ for (auto &ChildPair : CurrInternalNode->Children )
164
+ ToVisit.push (ChildPair.second );
165
+ ChildrenAddedToStack.insert (CurrInternalNode);
166
+ }
167
+ } else {
168
+ // The current node is a leaf node.
169
+ // We can simply set its LeftLeafIdx and RightLeafIdx.
170
+ CurrNode->setLeftLeafIdx (LeafCounter);
171
+ CurrNode->setRightLeafIdx (LeafCounter);
172
+ LeafCounter++;
173
+ auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode);
174
+ LeafNodes.push_back (CurrLeafNode);
175
+ }
176
+ }
177
+ }
178
+
108
179
unsigned SuffixTree::extend (unsigned EndIdx, unsigned SuffixesToAdd) {
109
180
SuffixTreeInternalNode *NeedsLink = nullptr ;
110
181
@@ -230,6 +301,7 @@ void SuffixTree::RepeatedSubstringIterator::advance() {
230
301
231
302
// Each leaf node represents a repeat of a string.
232
303
SmallVector<unsigned > RepeatedSubstringStarts;
304
+ SmallVector<SuffixTreeLeafNode *> LeafDescendants;
233
305
234
306
// Continue visiting nodes until we find one which repeats more than once.
235
307
while (!InternalNodesToVisit.empty ()) {
@@ -241,30 +313,35 @@ void SuffixTree::RepeatedSubstringIterator::advance() {
241
313
// it's too short, we'll quit.
242
314
unsigned Length = Curr->getConcatLen ();
243
315
244
- // Iterate over each child, saving internal nodes for visiting, and
245
- // leaf nodes' SuffixIdx in RepeatedSubstringStarts. Internal nodes
246
- // represent individual strings, which may repeat.
247
- for (auto &ChildPair : Curr->Children ) {
316
+ // Iterate over each child, saving internal nodes for visiting.
317
+ // Internal nodes represent individual strings, which may repeat.
318
+ for (auto &ChildPair : Curr->Children )
248
319
// Save all of this node's children for processing.
249
320
if (auto *InternalChild =
250
- dyn_cast<SuffixTreeInternalNode>(ChildPair.second )) {
321
+ dyn_cast<SuffixTreeInternalNode>(ChildPair.second ))
251
322
InternalNodesToVisit.push_back (InternalChild);
252
- continue ;
253
- }
254
-
255
- if (Length < MinLength)
256
- continue ;
257
-
258
- // Have an occurrence of a potentially repeated string. Save it.
259
- auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second );
260
- RepeatedSubstringStarts.push_back (Leaf->getSuffixIdx ());
261
- }
323
+
324
+ // If length of repeated substring is below threshold, then skip it.
325
+ if (Length < MinLength)
326
+ continue ;
262
327
263
328
// The root never represents a repeated substring. If we're looking at
264
329
// that, then skip it.
265
330
if (Curr->isRoot ())
266
331
continue ;
267
332
333
+ // Collect either the leaf children or all leaf descendants of this node, depending on OutlinerLeafDescendants.
334
+ if (!OutlinerLeafDescendants) {
335
+ for (auto &ChildPair : Curr->Children )
336
+ if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second ))
337
+ RepeatedSubstringStarts.push_back (Leaf->getSuffixIdx ());
338
+ } else {
339
+ LeafDescendants.assign (LeafNodes.begin () + Curr->getLeftLeafIdx (),
340
+ LeafNodes.begin () + Curr->getRightLeafIdx () + 1 );
341
+ for (SuffixTreeLeafNode *Leaf : LeafDescendants)
342
+ RepeatedSubstringStarts.push_back (Leaf->getSuffixIdx ());
343
+ }
344
+
268
345
// Do we have any repeated substrings?
269
346
if (RepeatedSubstringStarts.size () < 2 )
270
347
continue ;
0 commit comments