-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathlibrary.h
327 lines (273 loc) · 16.9 KB
/
library.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#pragma once
// #define UNICODE_SUPPORT
#define DEFAULT_SEPARATOR_CHAR XL(' ')
#define DEFAULT_MAX_EDIT_DISTANCE 2
#define DEFAULT_PREFIX_LENGTH 7
#define DEFAULT_COUNT_THRESHOLD 1
#define DEFAULT_INITIAL_CAPACITY 82765
#define DEFAULT_COMPACT_LEVEL 5
#define min3(a, b, c) (min(a, min(b, c)))
#define MAXINT LLONG_MAX
#define M
#define MAXLONG MAXINT
#include <fstream>
#include <sstream>
#include <cstring>
#include <cstdlib>
#include <algorithm>
#include <cctype>
#include <locale>
#include <regex>
#include <iostream>
#include "unordered_set"
#include "include/Defines.h"
#include "include/Helpers.h"
#include "include/EditDistance.h"
#include "cereal/types/unordered_map.hpp"
#include "cereal/types/string.hpp"
#include "cereal/types/vector.hpp"
#include "cereal/types/memory.hpp"
#include "cereal/archives/binary.hpp"
#include "cereal/cereal.hpp"
#include "fstream"
// SymSpell supports compound splitting / decompounding of multi-word input strings with three cases:
// 1. mistakenly inserted space into a correct word led to two incorrect terms
// 2. mistakenly omitted space between two correct words led to one incorrect combined term
// 3. multiple independent input terms with/without spelling errors
namespace symspellcpppy
{
static inline void ltrim(xstring &s)
{
s.erase(s.begin(), find_if(s.begin(), s.end(), [](xchar ch)
{ return !isspace(ch); }));
}
static inline void rtrim(xstring &s)
{
s.erase(find_if(s.rbegin(), s.rend(), [](xchar ch)
{ return !isspace(ch); })
.base(),
s.end());
}
static inline void trim(xstring &s)
{
ltrim(s);
rtrim(s);
}
class Info
{
private:
xstring segmentedstring;
xstring correctedstring;
int distanceSum;
double probabilityLogSum;
public:
void set(xstring &seg, xstring &cor, int d, double prob)
{
segmentedstring = seg;
correctedstring = cor;
distanceSum = d;
probabilityLogSum = prob;
};
xstring getSegmented() const
{
return segmentedstring;
};
xstring getCorrected() const
{
return correctedstring;
};
int getDistance() const
{
return distanceSum;
};
double getProbability() const
{
return probabilityLogSum;
};
};
/// <summary>Controls the closeness/quantity of returned spelling suggestions.</summary>
enum Verbosity
{
/// <summary>Top suggestion with the highest term frequency of the suggestions of smallest edit distance found.</summary>
Top,
/// <summary>All suggestions of smallest edit distance found, suggestions ordered by term frequency.</summary>
Closest,
/// <summary>All suggestions within maxEditDistance, suggestions ordered by edit distance
/// , then by term frequency (slower, no early termination).</summary>
All
};
class SymSpell
{
protected:
int initialCapacity;
int maxDictionaryEditDistance;
int prefixLength; // prefix length 5..7
long countThreshold; // a threshold might be specified, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
int compactMask;
DistanceAlgorithm distanceAlgorithm = DistanceAlgorithm::DamerauOSADistance;
int maxDictionaryWordLength; // maximum std::unordered_map term length
std::shared_ptr<std::unordered_map<int, std::vector<xstring>>> deletes;
std::unordered_map<xstring, int64_t> words;
std::unordered_map<xstring, int64_t> belowThresholdWords;
public:
int MaxDictionaryEditDistance() const;
int PrefixLength() const;
int MaxLength() const;
long CountThreshold() const;
int WordCount();
int EntryCount();
/// <summary>Create a new instanc of SymSpell.</summary>
/// <remarks>Specifying ann accurate initialCapacity is not essential,
/// but it can help speed up processing by alleviating the need for
/// data restructuring as the size grows.</remarks>
/// <param name="initialCapacity">The expected number of words in dictionary.</param>
/// <param name="maxDictionaryEditDistance">Maximum edit distance for doing lookups.</param>
/// <param name="prefixLength">The length of word prefixes used for spell checking..</param>
/// <param name="countThreshold">The minimum frequency count for dictionary words to be considered correct spellings.</param>
/// <param name="compactLevel">Degree of favoring lower memory use over speed (0=fastest,most memory, 16=slowest,least memory).</param>
explicit SymSpell(int maxDictionaryEditDistance = DEFAULT_MAX_EDIT_DISTANCE,
int prefixLength = DEFAULT_PREFIX_LENGTH, int countThreshold = DEFAULT_COUNT_THRESHOLD,
int initialCapacity = DEFAULT_INITIAL_CAPACITY,
unsigned char compactLevel = DEFAULT_COMPACT_LEVEL);
bool CreateDictionaryEntry(const xstring &key, int64_t count, const std::shared_ptr<SuggestionStage> &staging);
bool DeleteDictionaryEntry(const xstring &key);
std::unordered_map<xstring, long> bigrams;
int64_t bigramCountMin = MAXLONG;
/// <summary>Load multiple dictionary entries from a file of word/frequency count pairs</summary>
/// <remarks>Merges with any dictionary data already loaded.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
/// <param name="termIndex">The column position of the word.</param>
/// <param name="countIndex">The column position of the frequency count.</param>
/// <param name="separatorChars">Separator characters between term(s) and count.</param>
/// <returns>True if file loaded, or false if file not found.</returns>
bool LoadBigramDictionary(const std::string &corpus, int termIndex, int countIndex,
xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
bool LoadBigramDictionary(xifstream &corpusStream, int termIndex, int countIndex,
xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
/// <summary>Load multiple dictionary entries from a file of word/frequency count pairs</summary>
/// <remarks>Merges with any dictionary data already loaded.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
/// <param name="termIndex">The column position of the word.</param>
/// <param name="countIndex">The column position of the frequency count.</param>
/// <param name="separatorChars">Separator characters between term(s) and count.</param>
/// <returns>True if file loaded, or false if file not found.</returns>
bool LoadDictionary(const std::string &corpus, int termIndex, int countIndex,
xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
bool LoadDictionary(xifstream &corpusStream, int termIndex, int countIndex,
xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
/// <summary>Load multiple dictionary words from a file containing plain text.</summary>
/// <remarks>Merges with any dictionary data already loaded.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
/// <returns>True if file loaded, or false if file not found.</returns>
bool CreateDictionary(const std::string &corpus);
bool CreateDictionary(xifstream &corpusStream);
/// <summary>Remove all below threshold words from the dictionary.</summary>
/// <remarks>This can be used to reduce memory consumption after populating the dictionary from
/// a corpus using CreateDictionary.</remarks>
void PurgeBelowThresholdWords();
void CommitStaged(const std::shared_ptr<SuggestionStage> &staging);
/// <summary>Find suggested spellings for a given input word, using the maximum
/// edit distance specified during construction of the SymSpell dictionary.</summary>
/// <param name="input">The word being spell checked.</param>
/// <param name="verbosity">The value controlling the quantity/closeness of the retuned suggestions.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input word,
/// sorted by edit distance, and secondarily by count frequency.</returns>
std::vector<SuggestItem> Lookup(const xstring &input, Verbosity verbosity);
/// <summary>Find suggested spellings for a given input word, using the maximum
/// edit distance specified during construction of the SymSpell dictionary.</summary>
/// <param name="input">The word being spell checked.</param>
/// <param name="verbosity">The value controlling the quantity/closeness of the retuned suggestions.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input word,
/// sorted by edit distance, and secondarily by count frequency.</returns>
std::vector<SuggestItem> Lookup(const xstring &input, Verbosity verbosity, int maxEditDistance);
/// <summary>Find suggested spellings for a given input word.</summary>
/// <param name="input">The word being spell checked.</param>
/// <param name="verbosity">The value controlling the quantity/closeness of the retuned suggestions.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <param name="includeUnknown">Include input word in suggestions, if no words within edit distance found.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input word,
/// sorted by edit distance, and secondarily by count frequency.</returns>
std::vector<SuggestItem> Lookup(const xstring &input, Verbosity verbosity, int maxEditDistance, bool includeUnknown);
/// <summary>Find suggested spellings for a given input word.</summary>
/// <param name="input">The word being spell checked.</param>
/// <param name="verbosity">The value controlling the quantity/closeness of the retuned suggestions.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <param name="includeUnknown">Include input word in suggestions, if no words within edit distance found.</param>
/// <param name="transfer_casing"> Lower case the word or not
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input word,
/// sorted by edit distance, and secondarily by count frequency.</returns>
std::vector<SuggestItem> Lookup(const xstring &input, Verbosity verbosity, int maxEditDistance, bool includeUnknown, bool transferCasing);
private:
bool
DeleteInSuggestionPrefix(const xstring &deleteSugg, int deleteLen, xstring suggestion, int suggestionLen) const;
static std::vector<xstring> ParseWords(const xstring &text);
std::shared_ptr<std::unordered_set<xstring>>
Edits(const xstring &word, int editDistance, std::shared_ptr<std::unordered_set<xstring>> deleteWords);
std::shared_ptr<std::unordered_set<xstring>> EditsPrefix(const xstring &key);
int GetstringHash(const xstring &s) const;
public:
// ######################
// LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:
// 1. mistakenly inserted space into a correct word led to two incorrect terms
// 2. mistakenly omitted space between two correct words led to one incorrect combined term
// 3. multiple independent input terms with/without spelling errors
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
std::vector<SuggestItem> LookupCompound(const xstring &input);
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
std::vector<SuggestItem> LookupCompound(const xstring &input, int editDistanceMax);
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
std::vector<SuggestItem> LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing);
// ######
// WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions
// misspelled words are corrected and do not affect segmentation
// existing spaces are allowed and considered for optimum segmentation
// SymSpell.WordSegmentation uses a novel approach *without* recursion.
// https://medium.com/@wolfgarbe/fast-word-segmentation-for-noisy-text-2c2c41f9e8da
// While each string of length n can be segmentend in 2^n−1 possible compositions https://en.wikipedia.org/wiki/Composition_(combinatorics)
// SymSpell.WordSegmentation has a linear runtime O(n) to find the optimum composition
// number of all words in the corpus used to generate the frequency dictionary
// this is used to calculate the word occurrence probability p from word counts c : p=c/N
// N equals the sum of all counts c in the dictionary only if the dictionary is complete, but not if the dictionary is truncated or filtered
static const int64_t N = 1024908267229L;
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
Info WordSegmentation(const xstring &input);
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).</param>
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
Info WordSegmentation(const xstring &input, int maxEditDistance);
/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <param name="maxSegmentationWordLength">The maximum word length that should be considered.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).</param>
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
Info WordSegmentation(const xstring &input, int maxEditDistance, int maxSegmentationWordLength);
template <class Archive>
void serialize(Archive &ar)
{
ar(deletes, words, maxDictionaryWordLength);
}
};
}