Skip to content

Commit 551ff2d

Browse files
authored
chore: split KMP for strings into separate file (#823)
* chore: split KMP for strings into separate file * chore: split KMP for strings into separate file * fix copyright year * fix test
1 parent 51e6e0d commit 551ff2d

File tree

4 files changed

+109
-90
lines changed

4 files changed

+109
-90
lines changed

Batteries/Data/String.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
import Batteries.Data.String.Basic
22
import Batteries.Data.String.Lemmas
3+
import Batteries.Data.String.Matcher

Batteries/Data/String/Basic.lean

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ Copyright (c) 2022 Jannis Limperg. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
Authors: Jannis Limperg, James Gallicchio, F. G. Dorais
55
-/
6-
import Batteries.Data.Array.Match
76

87
instance : Coe String Substring := ⟨String.toSubstring⟩
98

@@ -12,63 +11,6 @@ namespace String
1211
protected theorem Pos.ne_zero_of_lt : {a b : Pos} → a < b → b ≠ 0
1312
| _, _, hlt, rfl => Nat.not_lt_zero _ hlt
1413

15-
/-- Knuth-Morris-Pratt matcher type
16-
17-
This type is used to keep data for running the Knuth-Morris-Pratt (KMP) string matching algorithm.
18-
KMP is a linear time algorithm to locate all substrings of a string that match a given pattern.
19-
Generating the algorithm data is also linear in the length of the pattern but the data can be
20-
re-used to match the same pattern over different strings.
21-
22-
The KMP data for a pattern string can be generated using `Matcher.ofString`. Then `Matcher.find?`
23-
and `Matcher.findAll` can be used to run the algorithm on an input string.
24-
```
25-
def m := Matcher.ofString "abba"
26-
27-
#eval Option.isSome <| m.find? "AbbabbA" -- false
28-
#eval Option.isSome <| m.find? "aabbaa" -- true
29-
30-
#eval Array.size <| m.findAll "abbabba" -- 2
31-
#eval Array.size <| m.findAll "abbabbabba" -- 3
32-
```
33-
-/
34-
structure Matcher extends Array.Matcher Char where
35-
/-- The pattern for the matcher -/
36-
pattern : Substring
37-
38-
/-- Make KMP matcher from pattern substring -/
39-
@[inline] def Matcher.ofSubstring (pattern : Substring) : Matcher where
40-
toMatcher := Array.Matcher.ofStream pattern
41-
pattern := pattern
42-
43-
/-- Make KMP matcher from pattern string -/
44-
@[inline] def Matcher.ofString (pattern : String) : Matcher :=
45-
Matcher.ofSubstring pattern
46-
47-
/-- The byte size of the string pattern for the matcher -/
48-
abbrev Matcher.patternSize (m : Matcher) : Nat := m.pattern.bsize
49-
50-
/-- Find all substrings of `s` matching `m.pattern`. -/
51-
partial def Matcher.findAll (m : Matcher) (s : Substring) : Array Substring :=
52-
loop s m.toMatcher #[]
53-
where
54-
/-- Accumulator loop for `String.Matcher.findAll` -/
55-
loop (s : Substring) (am : Array.Matcher Char) (occs : Array Substring) : Array Substring :=
56-
match am.next? s with
57-
| none => occs
58-
| some (s, am) =>
59-
loop s am <| occs.push { s with
60-
startPos := ⟨s.startPos.byteIdx - m.patternSize⟩
61-
stopPos := s.startPos }
62-
63-
/-- Find the first substring of `s` matching `m.pattern`, or `none` if no such substring exists. -/
64-
def Matcher.find? (m : Matcher) (s : Substring) : Option Substring :=
65-
match m.next? s with
66-
| none => none
67-
| some (s, _) =>
68-
some { s with
69-
startPos := ⟨s.startPos.byteIdx - m.patternSize⟩
70-
stopPos := s.startPos }
71-
7214
end String
7315

7416
namespace Substring
@@ -133,41 +75,10 @@ def dropSuffix? (s : Substring) (suff : Substring) : Option Substring :=
13375
else
13476
none
13577

136-
/--
137-
Returns all the substrings of `s` that match `pattern`.
138-
-/
139-
@[inline] def findAllSubstr (s pattern : Substring) : Array Substring :=
140-
(String.Matcher.ofSubstring pattern).findAll s
141-
142-
/--
143-
Returns the first substring of `s` that matches `pattern`,
144-
or `none` if there is no such substring.
145-
-/
146-
@[inline] def findSubstr? (s pattern : Substring) : Option Substring :=
147-
(String.Matcher.ofSubstring pattern).find? s
148-
149-
/--
150-
Returns true iff `pattern` occurs as a substring of `s`.
151-
-/
152-
@[inline] def containsSubstr (s pattern : Substring) : Bool :=
153-
s.findSubstr? pattern |>.isSome
154-
15578
end Substring
15679

15780
namespace String
15881

159-
@[inherit_doc Substring.findAllSubstr]
160-
abbrev findAllSubstr (s : String) (pattern : Substring) : Array Substring :=
161-
(String.Matcher.ofSubstring pattern).findAll s
162-
163-
@[inherit_doc Substring.findSubstr?]
164-
abbrev findSubstr? (s : String) (pattern : Substring) : Option Substring :=
165-
s.toSubstring.findSubstr? pattern
166-
167-
@[inherit_doc Substring.containsSubstr]
168-
abbrev containsSubstr (s : String) (pattern : Substring) : Bool :=
169-
s.toSubstring.containsSubstr pattern
170-
17182
/--
17283
If `pre` is a prefix of `s`, i.e. `s = pre ++ t`, returns the remainder `t`.
17384
-/

Batteries/Data/String/Matcher.lean

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/-
2+
Copyright (c) 2023 F. G. Dorais. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: F. G. Dorais
5+
-/
6+
import Batteries.Data.Array.Match
7+
import Batteries.Data.String.Basic
8+
9+
namespace String
10+
11+
/-- Knuth-Morris-Pratt matcher type
12+
13+
This type is used to keep data for running the Knuth-Morris-Pratt (KMP) string matching algorithm.
14+
KMP is a linear time algorithm to locate all substrings of a string that match a given pattern.
15+
Generating the algorithm data is also linear in the length of the pattern but the data can be
16+
re-used to match the same pattern over different strings.
17+
18+
The KMP data for a pattern string can be generated using `Matcher.ofString`. Then `Matcher.find?`
19+
and `Matcher.findAll` can be used to run the algorithm on an input string.
20+
```
21+
def m := Matcher.ofString "abba"
22+
23+
#eval Option.isSome <| m.find? "AbbabbA" -- false
24+
#eval Option.isSome <| m.find? "aabbaa" -- true
25+
26+
#eval Array.size <| m.findAll "abbabba" -- 2
27+
#eval Array.size <| m.findAll "abbabbabba" -- 3
28+
```
29+
-/
30+
structure Matcher extends Array.Matcher Char where
31+
/-- The pattern for the matcher -/
32+
pattern : Substring
33+
34+
/-- Make KMP matcher from pattern substring -/
35+
@[inline] def Matcher.ofSubstring (pattern : Substring) : Matcher where
36+
toMatcher := Array.Matcher.ofStream pattern
37+
pattern := pattern
38+
39+
/-- Make KMP matcher from pattern string -/
40+
@[inline] def Matcher.ofString (pattern : String) : Matcher :=
41+
Matcher.ofSubstring pattern
42+
43+
/-- The byte size of the string pattern for the matcher -/
44+
abbrev Matcher.patternSize (m : Matcher) : Nat := m.pattern.bsize
45+
46+
/-- Find all substrings of `s` matching `m.pattern`. -/
47+
partial def Matcher.findAll (m : Matcher) (s : Substring) : Array Substring :=
48+
loop s m.toMatcher #[]
49+
where
50+
/-- Accumulator loop for `String.Matcher.findAll` -/
51+
loop (s : Substring) (am : Array.Matcher Char) (occs : Array Substring) : Array Substring :=
52+
match am.next? s with
53+
| none => occs
54+
| some (s, am) =>
55+
loop s am <| occs.push { s with
56+
startPos := ⟨s.startPos.byteIdx - m.patternSize⟩
57+
stopPos := s.startPos }
58+
59+
/-- Find the first substring of `s` matching `m.pattern`, or `none` if no such substring exists. -/
60+
def Matcher.find? (m : Matcher) (s : Substring) : Option Substring :=
61+
match m.next? s with
62+
| none => none
63+
| some (s, _) =>
64+
some { s with
65+
startPos := ⟨s.startPos.byteIdx - m.patternSize⟩
66+
stopPos := s.startPos }
67+
68+
end String
69+
70+
namespace Substring
71+
72+
/--
73+
Returns all the substrings of `s` that match `pattern`.
74+
-/
75+
@[inline] def findAllSubstr (s pattern : Substring) : Array Substring :=
76+
(String.Matcher.ofSubstring pattern).findAll s
77+
78+
/--
79+
Returns the first substring of `s` that matches `pattern`,
80+
or `none` if there is no such substring.
81+
-/
82+
@[inline] def findSubstr? (s pattern : Substring) : Option Substring :=
83+
(String.Matcher.ofSubstring pattern).find? s
84+
85+
/--
86+
Returns true iff `pattern` occurs as a substring of `s`.
87+
-/
88+
@[inline] def containsSubstr (s pattern : Substring) : Bool :=
89+
s.findSubstr? pattern |>.isSome
90+
91+
end Substring
92+
93+
namespace String
94+
95+
@[inherit_doc Substring.findAllSubstr]
96+
abbrev findAllSubstr (s : String) (pattern : Substring) : Array Substring :=
97+
(String.Matcher.ofSubstring pattern).findAll s
98+
99+
@[inherit_doc Substring.findSubstr?]
100+
abbrev findSubstr? (s : String) (pattern : Substring) : Option Substring :=
101+
s.toSubstring.findSubstr? pattern
102+
103+
@[inherit_doc Substring.containsSubstr]
104+
abbrev containsSubstr (s : String) (pattern : Substring) : Bool :=
105+
s.toSubstring.containsSubstr pattern
106+
107+
end String

test/kmp_matcher.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import Batteries.Data.String.Basic
1+
import Batteries.Data.String.Matcher
22

33
/-! Tests for Knuth-Morris-Pratt matching algorithm -/
44

0 commit comments

Comments
 (0)