Skip to content

Commit a0667c1

Browse files
Lexeyandrewvk
authored andcommitted
Added method to enumerate all suffixes in the suffix tree
1 parent b7a6a6e commit a0667c1

File tree

4 files changed

+221
-60
lines changed

4 files changed

+221
-60
lines changed

Experimental/src/CodeJam.Experimental.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
<Compile Include="Collections\DisjointSets.cs" />
6767
<Compile Include="Collections\DisjointSetsBase.cs" />
6868
<Compile Include="Collections\DisjointSetsT.cs" />
69+
<Compile Include="Collections\Suffix.cs" />
6970
<Compile Include="Collections\SuffixTreeBase.cs" />
7071
<Compile Include="Collections\SuffixTree.cs" />
7172
<Compile Include="Parsing\BinaryOperator.cs" />
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
using JetBrains.Annotations;
2+
3+
namespace CodeJam.Collections
4+
{
5+
/// <summary>Suffix of the SuffixTree</summary>
6+
[PublicAPI]
7+
public struct Suffix
8+
{
9+
/// <summary>Buffer of all added strings</summary>
10+
private readonly string _buffer;
11+
/// <summary>Offset to the beginning of the suffix in the buffer</summary>
12+
private readonly int _offset;
13+
14+
/// <summary>Constructs a new suffix</summary>
15+
/// <param name="buffer">Buffer with all added strings</param>
16+
/// <param name="sourceIndex">Source string index</param>
17+
/// <param name="offset">Offset of the suffix in the buffer</param>
18+
/// <param name="length">Length of the suffix</param>
19+
internal Suffix([NotNull] string buffer, int sourceIndex, int offset, int length)
20+
{
21+
DebugCode.NotNull(buffer, nameof(buffer));
22+
DebugCode.ValidIndex(sourceIndex, nameof(sourceIndex));
23+
DebugCode.ValidIndexAndCount(offset, nameof(offset), length, nameof(length), buffer.Length);
24+
_buffer = buffer;
25+
SourceIndex = sourceIndex;
26+
_offset = offset;
27+
Length = length;
28+
}
29+
30+
/// <summary>
31+
/// The index of the source string in the order of addition to the suffix tree
32+
/// <remarks>0 - for the first added string, 1 - for the second, etc</remarks>
33+
/// </summary>
34+
public int SourceIndex { get; }
35+
/// <summary>The length of the suffix</summary>
36+
public int Length { get; }
37+
/// <summary>The suffix value</summary>
38+
public string Value => _buffer.Substring(_offset, Length);
39+
40+
/// <summary>String conversion operator</summary>
41+
/// <param name="suffix">The suffix to convert</param>
42+
public static implicit operator string(Suffix suffix) => suffix.Value;
43+
}
44+
}

Experimental/src/Collections/SuffixTreeBase.cs

Lines changed: 139 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
namespace CodeJam.Collections
77
{
8-
public abstract class SuffixTreeBase
9-
{
8+
public abstract class SuffixTreeBase
9+
{
1010
/// <summary>Node alignment in Print output</summary>
1111
private const int Align = 6;
1212
/// <summary>Root node index</summary>
13-
protected const int RootNodeIndex = 0;
13+
protected const int RootNodeIndex = 0;
1414

1515
/// <summary>Tree nodes</summary>
1616
private readonly List<Node> _nodes;
@@ -21,27 +21,27 @@ public abstract class SuffixTreeBase
2121
/// <summary>Adds a new node</summary>
2222
/// <param name="node">A node to add</param>
2323
/// <returns>Index of the node</returns>
24-
protected int AddNode(Node node)
25-
{
26-
var index = _nodes.Count;
24+
protected int AddNode(Node node)
25+
{
26+
var index = _nodes.Count;
2727
_nodes.Add(node);
28-
return index;
29-
}
28+
return index;
29+
}
3030

3131
/// <summary>Updates the node at the index</summary>
3232
/// <param name="index">The index to update</param>
3333
/// <param name="node">The new node value</param>
34-
protected void UpdateNode(int index, Node node) => _nodes[index] = node;
34+
protected void UpdateNode(int index, Node node) => _nodes[index] = node;
3535

36-
/// <summary>Gets a node at the index</summary>
36+
/// <summary>Gets a node at the index</summary>
3737
/// <param name="index">The index of the node</param>
3838
/// <returns>The node</returns>
39-
protected Node GetNode(int index) => _nodes[index];
39+
protected Node GetNode(int index) => _nodes[index];
4040

4141
/// <summary>Number of nodes</summary>
42-
protected int NodesCount => _nodes.Count;
42+
protected int NodesCount => _nodes.Count;
4343

44-
/// <summary>Concatenated input strings</summary>
44+
/// <summary>Concatenated input strings</summary>
4545
protected string InternalData { get; private set; }
4646

4747
/// <summary>List of end positions of added strings inside the InternalData</summary>
@@ -75,73 +75,141 @@ public void Add([NotNull]string data)
7575
BuildFor(begin, InternalData.Length);
7676
}
7777

78+
/// <summary>Enumerates all suffixes in the suffix tree</summary>
79+
/// <remarks>May return suffixes with the same value if they are present in different source strings</remarks>
80+
/// <returns>The enumeration of all suffixes</returns>
81+
[PublicAPI]
82+
public IEnumerable<Suffix> AllSuffixes()
83+
{
84+
if (Root.IsLeaf) // Empty tree
85+
{
86+
yield break;
87+
}
88+
89+
var branchStack = new Stack<BranchPoint>();
90+
var branchPoint = new BranchPoint { Node = Root, EdgeIndex = 0 };
91+
var length = 0;
92+
for (;;)
93+
{
94+
var edge = GetNode(branchPoint.Node.Children[branchPoint.EdgeIndex]);
95+
var edgeLength = edge.Length;
96+
length += edgeLength;
97+
if (!edge.IsTerminal)
98+
{
99+
branchPoint.Length = edgeLength;
100+
branchStack.Push(branchPoint);
101+
branchPoint = new BranchPoint { Node = edge, EdgeIndex = 0 };
102+
continue;
103+
}
104+
105+
// We have descended to a terminal edge. Let's produce a suffix
106+
var end = edge.End;
107+
var offset = end - length;
108+
var sourceIndex = GetSourceIndexByEnd(end);
109+
yield return new Suffix(InternalData, sourceIndex, offset, length);
110+
111+
// Move to the next suffix branch
112+
for (;;)
113+
{
114+
length -= edgeLength;
115+
var nextEdgeIndex = branchPoint.EdgeIndex + 1;
116+
if (nextEdgeIndex < branchPoint.Node.Children.Count)
117+
{
118+
branchPoint.EdgeIndex = nextEdgeIndex;
119+
break;
120+
}
121+
// There are no more branches on the current level
122+
// Return to the previous level
123+
if (branchStack.Count == 0)
124+
{
125+
// no more branches to visit
126+
DebugCode.AssertState(length == 0, "Illegal final length. Check logic");
127+
yield break;
128+
}
129+
branchPoint = branchStack.Pop();
130+
edgeLength = branchPoint.Length;
131+
}
132+
}
133+
}
134+
135+
/// <summary>Locates the source string index by the suffix end</summary>
136+
/// <param name="end">The suffix end</param>
137+
/// <returns>The source string index</returns>
138+
private int GetSourceIndexByEnd(int end)
139+
{
140+
var index = EndPositions.LowerBound(end);
141+
DebugCode.AssertState(index < EndPositions.Count && EndPositions[index] == end
142+
, "Invalid source index computed. Check logic");
143+
return index;
144+
}
145+
78146
/// <summary>Appends suffixes for the last added string</summary>
79-
protected abstract void BuildFor(int begin, int end);
147+
protected abstract void BuildFor(int begin, int end);
80148

81149
/// <summary>Creates a comparer for nodes against a char</summary>
82150
/// <returns>The comparer</returns>
83-
protected Func<int, char, int> GetComparer() => (index, c) =>
84-
{
85-
var node = GetNode(index);
86-
if (node.Begin == node.End) // no char always less than any char
87-
{
88-
return -1;
89-
}
151+
protected Func<int, char, int> GetComparer() => (index, c) =>
152+
{
153+
var node = GetNode(index);
154+
if (node.Begin == node.End) // no char always less than any char
155+
{
156+
return -1;
157+
}
90158
var firstChar = InternalData[node.Begin];
91-
return firstChar - c;
92-
};
159+
return firstChar - c;
160+
};
93161

94162
/// <summary>Prints the tree structure to the string for the debugging purposes</summary>
95163
/// <returns>The tree structure as a string</returns>
96164
[Pure]
97165
public string Print()
98-
{
99-
var sb = new StringBuilder();
100-
var currentIndex = RootNodeIndex;
101-
var stack = new List<ValueTuple<int, int>>();
102-
for (;;)
103-
{
104-
PrintNodeWithPath(sb, currentIndex, stack);
105-
var node = GetNode(currentIndex);
106-
if (node.Children != null)
107-
{
108-
stack.Add(ValueTuple.Create(currentIndex, node.Children.Count - 2));
109-
currentIndex = node.Children[node.Children.Count - 1];
166+
{
167+
var sb = new StringBuilder();
168+
var currentIndex = RootNodeIndex;
169+
var stack = new List<ValueTuple<int, int>>();
170+
for (;;)
171+
{
172+
PrintNodeWithPath(sb, currentIndex, stack);
173+
var node = GetNode(currentIndex);
174+
if (node.Children != null)
175+
{
176+
stack.Add(ValueTuple.Create(currentIndex, node.Children.Count - 2));
177+
currentIndex = node.Children[node.Children.Count - 1];
110178
continue;
111-
}
112-
currentIndex = -1;
113-
while (stack.Count > 0)
114-
{
179+
}
180+
currentIndex = -1;
181+
while (stack.Count > 0)
182+
{
115183
var t = stack[stack.Count - 1];
116184
stack.RemoveAt(stack.Count - 1);
117185
node = GetNode(t.Item1);
118186
var nextChild = t.Item2;
119-
if (nextChild >= 0)
120-
{
121-
currentIndex = node.Children[nextChild];
187+
if (nextChild >= 0)
188+
{
189+
currentIndex = node.Children[nextChild];
122190
stack.Add(ValueTuple.Create(t.Item1, nextChild - 1));
123-
break;
124-
}
191+
break;
192+
}
125193
}
126-
if (currentIndex == -1)
127-
{
128-
break;
129-
}
130-
}
194+
if (currentIndex == -1)
195+
{
196+
break;
197+
}
198+
}
131199
return sb.ToString();
132-
}
200+
}
133201

134202
/// <summary>Prints a single node representation along with the path prefix</summary>
135203
/// <param name="sb">The builder to print to</param>
136204
/// <param name="nodeIndex">The index of the node</param>
137205
/// <param name="stack">The stack of nodes to process</param>
138-
private void PrintNodeWithPath([NotNull] StringBuilder sb, int nodeIndex
206+
private void PrintNodeWithPath([NotNull] StringBuilder sb, int nodeIndex
139207
, [NotNull] IReadOnlyList<ValueTuple<int, int>> stack)
140-
{
141-
if (stack.Count > 0)
142-
{
143-
for (var i = 0; i < stack.Count - 1; ++i)
144-
{
208+
{
209+
if (stack.Count > 0)
210+
{
211+
for (var i = 0; i < stack.Count - 1; ++i)
212+
{
145213
sb.Append(stack[i].Item2 >= 0 ? '|' : ' ');
146214
sb.Append(' ', Align - 1);
147215
}
@@ -155,13 +223,13 @@ private void PrintNodeWithPath([NotNull] StringBuilder sb, int nodeIndex
155223
sb.Append('_', Align - 1);
156224
}
157225
PrintNodeText(sb, nodeIndex);
158-
}
226+
}
159227

160228
/// <summary>Prints a single node information</summary>
161229
/// <param name="sb">The builder to print to</param>
162230
/// <param name="nodeIndex">The node index</param>
163-
protected virtual void PrintNodeText([NotNull] StringBuilder sb, int nodeIndex)
164-
{
231+
protected virtual void PrintNodeText([NotNull] StringBuilder sb, int nodeIndex)
232+
{
165233
var n = GetNode(nodeIndex);
166234
sb.AppendLine($"({nodeIndex}, [{n.Begin}-{n.End}), {InternalData.Substring(n.Begin, n.Length)})");
167235
}
@@ -175,7 +243,7 @@ protected struct Node
175243
/// <param name="begin">An edge start offset</param>
176244
/// <param name="end">An edge end offset</param>
177245
/// <param name="terminal">Whether the edge terminates the string</param>
178-
public Node(int begin, int end, bool terminal) : this(begin, end, terminal, null) {}
246+
public Node(int begin, int end, bool terminal) : this(begin, end, terminal, null) { }
179247

180248
/// <summary>Constructs a new node</summary>
181249
/// <param name="begin">An edge start offset</param>
@@ -206,5 +274,16 @@ public Node(int begin, int end, bool terminal, List<int> children)
206274
/// <summary>Length of the corresponding substring</summary>
207275
public int Length => End - Begin;
208276
}
277+
278+
/// <summary>Branching point</summary>
279+
private class BranchPoint
280+
{
281+
/// <summary>The tree node</summary>
282+
public Node Node;
283+
/// <summary>The chosen edge</summary>
284+
public int EdgeIndex;
285+
/// <summary>The length over the edge</summary>
286+
public int Length;
287+
}
209288
}
210289
}

Experimental/tests/Collections/SuffixTreeTest.cs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.Linq;
34

45
using NUnit.Framework;
@@ -57,6 +58,42 @@ public void Test15RandomMultiple()
5758
}
5859
}
5960

61+
[Test]
62+
public void Test16AllSuffixes()
63+
{
64+
const int length = 50;
65+
for (var numberOfString = 1; numberOfString < 6; ++numberOfString)
66+
{
67+
var strings = Enumerable.Range(0, numberOfString)
68+
.Select(_ => MakeRandomString(length)).ToArray();
69+
var expectedSuffixes = new List<string>();
70+
var expectedCounts = new LazyDictionary<string, List<int>>(_ => new List<int>());
71+
var st = new SuffixTree();
72+
for (var i = 0; i < strings.Length; ++i)
73+
{
74+
var s = strings[i];
75+
st.Add(s);
76+
for (var j = 0; j < s.Length; ++j)
77+
{
78+
var suffix = s.Substring(j);
79+
expectedSuffixes.Add(suffix);
80+
expectedCounts[suffix].Add(i);
81+
}
82+
}
83+
st.Compact();
84+
expectedSuffixes.Sort();
85+
var suffixes = st.AllSuffixes().ToList();
86+
Assert.That(suffixes.Select(_ => _.Value).ToList(), Is.EqualTo(expectedSuffixes));
87+
var grouped = suffixes.Select(_ => new { value = _.Value, source = _.SourceIndex })
88+
.GroupBy(_ => _.value).ToDictionary(_ => _.Key, _ => _.Select(v => v.source).OrderBy(v => v).ToList());
89+
foreach (var v in grouped)
90+
{
91+
Assert.That(v.Value, Is.EqualTo(expectedCounts[v.Key]));
92+
}
93+
}
94+
}
95+
96+
6097
protected override void Check(string expected, params string[] data)
6198
{
6299
base.Check(expected, data);

0 commit comments

Comments
 (0)