-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlufz-util.h
255 lines (221 loc) · 7.51 KB
/
lufz-util.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#ifndef LUFZ_H_
#define LUFZ_H_
#include <stdlib.h>

#include <map>
#include <set>
#include <string>
#include <vector>

#include "lufz-configs.h"
#include "lufz-utf8.h"
namespace lufz {
// Length limits.
// NOTE(review): the exact use of each limit is not visible in this header;
// the names suggest a cap on input line length, a cap on entry length, and a
// length threshold for wildcard handling -- confirm against the implementation.
constexpr size_t MAX_LINE_LENGTH = 65536;
constexpr int MAX_ENTRY_LENGTH = 30;
constexpr int WILDIZE_ALL_BEYOND = 10;

// Shard counts for the indices.
// NOTE(review): presumably these are the num_shards values handed to
// LufzUtil::IndexShard() for the regular, anagram ("Agm") and phonetic
// indices respectively -- confirm against the callers.
constexpr int INDEX_SHARDS = 2000;
constexpr int AGM_INDEX_SHARDS = 2000;
constexpr int PHONE_INDEX_SHARDS = 2000;

// Version tag.
const std::string VERSION = "v0.08";
/**
 * Everything recorded about a single phrase of the lexicon.
 */
struct PhraseInfo {
  /** Normalized version of the phrase: letters and spaces only. */
  std::string normalized;

  /**
   * Ordered index of the phrase's base form (the original notes refer to it
   * as forms[0]). Index 0 is always the dummy empty string.
   * Defaults to -1.
   */
  int base_index = -1;

  /**
   * Case/punctuation variants of the phrase.
   * Every variant is built only from letters of the lexicon (in either
   * lower or upper case, even though Lexicon.letters lists uppercase
   * letters only), spaces, and the allowed punctuation marks
   * (typically just ['-]).
   */
  std::set<std::string> forms;

  /** Importance score. Defaults to 0. */
  long double importance = 0;

  /**
   * The possible pronunciations for a form. Each pronunciation is a
   * vector of phonemes (IPA or ARPAbet).
   */
  std::set<std::vector<std::string>> phones;

  /** Default construction: empty containers, base_index -1, importance 0. */
  PhraseInfo() = default;
};
/**
 * A lexicon: its phrase entries plus the alphabet-related settings that go
 * with them. Plain data holder; it declares fields only.
 */
struct Lexicon {
/**
 * The phrase entries.
 * NOTE(review): the notes on PhraseInfo::base_index and on
 * LufzUtil::ReadLexicon() say that entry 0 is always the empty phrase --
 * confirm against the implementation.
 */
std::vector<PhraseInfo> phrase_infos;
/**
 * Only the letters used in the lexicon. All uppercase.
 */
std::set<std::string> letters;
/**
 * The rest of the fields are copied from the config, and
 * may include things that do not occur in the lexicon at all.
 */
std::vector<std::string> vowels;
std::vector<std::string> consonants;
std::set<std::string> combiners;
std::set<std::string> punctuations;
std::set<std::string> spaces;
// NOTE(review): presumably maps an input string to the string it should be
// replaced with (LufzUtil::PartsOf() is documented as applying these) --
// confirm the direction of the mapping against the config.
std::map<std::string, std::string> conversions;
};
/**
 * Helpers for one named language config: splitting strings into
 * letter/punctuation/space "parts", computing index keys, hashing keys into
 * shards, and reading a lexicon file.
 *
 * Only the small inline members are defined in this header; everything else
 * is defined in the implementation file.
 */
class LufzUtil {
 public:
  /**
   * config_name should be a key for one of the predefined configs in
   * lufz-configs.h. Currently supported values (case-sensitive):
   *   "Brazilian"
   *   "English"
   *   "French"
   *   "German"
   *   "Hindi"
   *   "Italian"
   *   "Marathi"
   *   "Phonetics"
   *   "Portuguese"
   *   "Spanish"
   */
  explicit LufzUtil(const std::string& config_name);

  /**
   * Returns the language_ member. Const-qualified so that it can be called
   * through a const LufzUtil.
   */
  const std::string& Language() const {
    return language_;
  }

  /**
   * Returns the script_ member. Const-qualified so that it can be called
   * through a const LufzUtil.
   */
  const std::string& Script() const {
    return script_;
  }

  /**
   * Returns the name field of the config this object points at.
   * Const-qualified so that it can be called through a const LufzUtil.
   * NOTE(review): dereferences config_ without a null check; assumes the
   * constructor always leaves config_ pointing at a valid config -- confirm.
   */
  const std::string& ConfigName() const {
    return config_->name;
  }

  /**
   * Joins the strings in v into a single string.
   * NOTE(review): presumably delimiter is inserted between consecutive
   * elements; the implementation is not visible in this header.
   */
  std::string Join(const std::vector<std::string>& v, const std::string& delimiter = "");

  /**
   * Splits str into pieces.
   * NOTE(review): the splitting rule, and in particular the meaning of the
   * default empty delimiter, is defined by the implementation and is not
   * visible in this header.
   */
  std::vector<std::string> Split(const std::string& str, const std::string& delimiter = "");

  /**
   * NOTE(review): presumably returns true when str ends with suffix; the
   * implementation is not visible in this header.
   */
  bool EndsWith(const std::string& str, const std::string& suffix);

  /**
   * Returns a vector of "parts" of the string. A part is something that
   * is a letter (or a case-changed variant of the letter), or punctuation,
   * or a space, or a UTF8Char. It does the following preprocessing:
   * - Applies Lexicon.conversions.
   * - Maps spaces (i.e., converts each kind of space listed in
   *   Lexicon.spaces into a regular space), unless maps_spaces is false.
   * - Removes leading/trailing spaces, combines consecutive spaces.
   * - If the script is LATIN, and if a string is not a letter but it
   *   can be converted to a letter by removing diacritics, then it does so.
   *   (The original note names "Lexicon.script", but Lexicon as declared in
   *   this header has no such field; presumably this is the script reported
   *   by Script() -- confirm.)
   * Eg, for English:
   *   "[Alpha-x 4\t\tD'e " ->
   *   {"[", "A", "l", "p", "h", "a", "-", "x", " ", "4", " ", "D", "'", "e"}
   */
  std::vector<std::string> PartsOf(const std::string& s, bool maps_spaces = true);

  /**
   * Takes the result of PartsOf() and removes entries that are not letters
   * (ignoring case), not spaces, and not in Lexicon.punctuations.
   * Also returns the results of PartsOf() in *parts_of, if not null.
   * Eg, for English:
   *   "[Alpha-x 4\t\tD'e " ->
   *   {"A", "l", "p", "h", "a", "-", "x", " ", "D", "'", "e"}
   */
  std::vector<std::string> PrunedPartsOf(
      const std::string& s,
      std::vector<std::string>* parts_of = nullptr);

  /**
   * Takes the result of PrunedPartsOf() and converts all letter-convertible
   * entries to letters (changing case if needed), and replaces punctuations
   * with spaces, thus returning letters and spaces only. Trims consecutive
   * spaces and leading/trailing spaces.
   * Also returns the results of PartsOf() in *parts_of, and PrunedPartsOf()
   * in *pruned_parts_of, if not null.
   * Eg, for English:
   *   "[Alpha-x 4\t\tD'e " ->
   *   {"A", "L", "P", "H", "A", " ", "X", " ", "D", " ", "E"}
   */
  std::vector<std::string> LetterizedPrunedPartsOf(
      const std::string& s,
      std::vector<std::string>* parts_of = nullptr,
      std::vector<std::string>* pruned_parts_of = nullptr);

  /**
   * Takes the results of LetterizedPrunedPartsOf() and retains
   * only the letters.
   * Also returns the results of PartsOf() in *parts_of, PrunedPartsOf()
   * in *pruned_parts_of, and LetterizedPrunedPartsOf() in
   * *letterized_pruned_parts_of, if not null.
   * Eg, for English:
   *   "[Alpha-x 4\t\tD'e " ->
   *   {"A", "L", "P", "H", "A", "X", "D", "E"}
   */
  std::vector<std::string> LettersOf(
      const std::string& s,
      std::vector<std::string>* parts_of = nullptr,
      std::vector<std::string>* pruned_parts_of = nullptr,
      std::vector<std::string>* letterized_pruned_parts_of = nullptr);

  /**
   * Returns the joined output of PartsOf().
   */
  std::string StrPartsOf(const std::string& s) {
    return Join(PartsOf(s));
  }

  /**
   * Returns the joined output of PrunedPartsOf().
   */
  std::string StrPrunedPartsOf(const std::string& s) {
    return Join(PrunedPartsOf(s));
  }

  /**
   * Returns the joined output of LetterizedPrunedPartsOf().
   */
  std::string StrLetterizedPrunedPartsOf(const std::string& s) {
    return Join(LetterizedPrunedPartsOf(s));
  }

  /**
   * Returns the joined output of LettersOf().
   */
  std::string StrLettersOf(const std::string& s) {
    return Join(LettersOf(s));
  }

  /**
   * Returns true if s is in Lexicon.letters, ignoring case. If "letter"
   * is not null, then sets *letter to the actual matched letter when
   * returning true.
   */
  bool IsLetter(const std::string& s, std::string* letter = nullptr);

  /**
   * Returns true only for punctuation allowed in Lexicon.punctuations.
   */
  bool IsPunctuation(const std::string& s);

  /**
   * Returns true if PartsOf() yields all "?"s only.
   */
  bool AllWild(const std::string& s);

  /**
   * Returns the indexing key for s.
   */
  std::string Key(const std::string& s);

  /**
   * Returns the indexing anagram key for s.
   */
  std::string AgmKey(const std::string& s);

  /**
   * A hash value of the string. We use Java's
   * hashing algo here (and also in downstream JavaScript code).
   */
  static int JavaHash(const std::string& key);

  /**
   * The shard that key belongs to, given num_shards shards in total.
   */
  static int IndexShard(const std::string& key, int num_shards);

  /**
   * Reads a lexicon from lexicon_file into *lexicon.
   * Pass "-" as the file name for stdin.
   * Formats accepted:
   *   <entry>
   * OR (cannot mix the two formats):
   *   <importance>\t<entry>
   * The 0th entry created is always the empty phrase.
   * All entries that map to the same StrLetterizedPrunedPartsOf() are
   * combined (but retained as different "forms" of each other). When
   * combining like this, we take the max of importance scores (if any).
   *
   * NOTE(review): crossed_words_file is optional (defaults to nullptr); its
   * format and effect, and the meaning of the bool result (presumably
   * success/failure), are not described in this header -- confirm against
   * the implementation.
   */
  bool ReadLexicon(const char* lexicon_file, Lexicon* lexicon, const char* crossed_words_file = nullptr);

 private:
  /**
   * IsLetter() variant that works on a potential letter that has
   * already been split into its UTF8Chars and uppercased.
   */
  bool IsLetter(const std::vector<std::string>& chars);

  // The config in use; read by ConfigName().
  // NOTE(review): presumably looked up from config_name by the constructor
  // and not owned by this object -- confirm.
  const LufzConfig* config_;
  // Returned by Language() and Script() respectively.
  std::string language_;
  std::string script_;
  // NOTE(review): presumably the config's letters and each letter's position
  // in letters_; how they are populated is not visible in this header.
  std::vector<std::string> letters_;
  std::map<std::string, int> letter_indices_;
};
} // namespace lufz
#endif // LUFZ_H_