Skip to content

Commit a102ec5

Browse files
author
gabriel.csollei
committed
Add new test to stemmer
1 parent ac4aae9 commit a102ec5

File tree

2 files changed

+244
-0
lines changed

2 files changed

+244
-0
lines changed

src/utils/SlovakStemmer.spec.ts

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
import { expect } from "chai";
2+
import "mocha";
3+
import { SlovakStemmer } from "./SlovakStemmer";
4+
5+
// Spec for SlovakStemmer.steme: each inflected form must reduce to the listed stem.
describe("SlovakStemmer", () => {
  describe("Stemme", () => {
    it("It should remove ends of words", () => {
      // [inflected word, expected stem]. Each pair is listed once; the
      // repeated "Jakub", "Jakubovi" and "Jakubom" checks added no coverage.
      const cases: ReadonlyArray<readonly [string, string]> = [
        ["rybárove", "rybár"],
        ["rybárovích", "rybár"],
        ["rybárova", "rybár"],
        ["rybárovi", "rybár"],
        ["rybárov", "rybár"],
        ["rybáry", "rybár"],

        ["Jakub", "Jakub"],
        ["Jakubovi", "Jakub"],
        ["Jakubom", "Jakub"],
        ["Jakubov", "Jakub"],
        ["Jakuba", "Jakub"],
        ["Jakuboch", "Jakub"],
        ["Jakubmi", "Jakub"],

        ["najžľaznatejšieho", "žľaznat"],
        ["najžľaznatejších", "žľaznat"],
        ["najžľaznatejšia", "žľaznat"],
        ["zefektívnenie", "zefektívn"],
        // Disabled in the original spec; kept here for reference.
        // ["zefektívnenila", "zefektívn"],
        ["umožnenie", "umožn"],
      ];

      for (const [word, stem] of cases) {
        // The word is passed as the assertion message so a failure names the input.
        expect(SlovakStemmer.steme(word), word).to.be.equal(stem);
      }
    });
  });
});

src/utils/SlovakStemmer.ts

Lines changed: 209 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,209 @@
1+
/**
 * Strips the leading "naj" prefix ("predpona" is Slovak for "prefix") from a word.
 *
 * The prefix is removed only when the word is longer than six characters and
 * starts with the lowercase sequence "naj"; the match is case-sensitive.
 *
 * @param char The word to process (despite the name, a whole word, not one character).
 * @returns The word without its first three characters when the rule applies,
 *          otherwise the word unchanged.
 */
function removePredpona(char: string): string {
  if (char.length > 6 && char.startsWith("naj")) {
    // slice(3) replaces the deprecated substr(3, length); the result is identical.
    return char.slice(3);
  }

  return char;
}
8+
9+
/**
 * One tier of ending-removal rules: when the word is strictly longer than
 * `longerThan` and ends with any of `suffixes`, `strip` trailing characters
 * are removed.
 */
interface CaseSuffixRule {
  readonly longerThan: number;
  readonly strip: number;
  readonly suffixes: readonly string[];
}

/**
 * Ending tiers, longest endings first. The order matters: the first tier that
 * matches wins, so a long ending is never shadowed by a shorter one.
 *
 * Endings that were deliberately disabled in the original rule set and are
 * therefore NOT listed: "iho" (very small impact), "ové", "ový", "oví",
 * "atá", "ín".
 */
const CASE_SUFFIX_RULES: readonly CaseSuffixRule[] = [
  {
    longerThan: 9,
    strip: 7,
    suffixes: ["ejšieho", "ejšiemu"],
  },
  {
    longerThan: 8,
    strip: 6,
    suffixes: ["ejších", "encoch", "ejšími", "encami"],
  },
  {
    longerThan: 7,
    strip: 5,
    suffixes: [
      "ejšia", "atami", "atách", "eniec", "encom", "ejšom",
      "ejším", "ejšej", "ejšou", "ejšiu", "ejšie",
    ],
  },
  {
    longerThan: 6,
    strip: 4,
    suffixes: [
      "eťom", "iami", "atám", "aťom", "ovia", "iach", "ence",
      "ieho", "iemu", "ieme", "iete", "ejší", "enie",
    ],
  },
  {
    longerThan: 5,
    strip: 3,
    suffixes: [
      "ich", "eho", "ych", "ích", "ého", "emi", "ému", "emu",
      "ími", "imi", "ách", "ých", "ami", "ovi", "ieť", "ieš",
      "ejú", "ajú", "ujú", "eme", "íte", "íme", "ými", "ymi",
      "ach", "iam", "iac", "ite", "ili", "ila", "ilo", "ime", "och",
    ],
  },
  {
    longerThan: 4,
    strip: 2,
    suffixes: [
      "ím", "ám", "am", "us", "ým", "ym", "mi", "ou", "om", "ej",
      "ov", "ia", "ie", "iu", "im", "ho", "mu", "me", "te", "ať",
      "aš", "úť", "iť", "íš", "iš", "il", "úc", "eš",
    ],
  },
];

/** Single trailing vowels removed as a last resort ("ô" is intentionally excluded). */
const TRAILING_VOWELS: ReadonlySet<string> = new Set([
  "a", "e", "i", "o", "u", "ú", "y", "á", "é", "í", "ý",
]);

/**
 * Removes one inflectional ending from a word.
 *
 * Tiers are tried from the longest endings (7 characters) down to 2; the first
 * matching tier strips its ending and returns. If no tier matches and the word
 * is longer than three characters, a single trailing vowel is removed.
 *
 * Fix over the previous version: the length guard of the 7-character tier now
 * covers both endings. Previously `len > 9 && A || B` let "ejšiemu" bypass the
 * guard, so short words lost too much (for example "krajšiemu" became "kr").
 *
 * @param key The word to strip.
 * @returns The word with at most one ending removed.
 */
function removeCase(key: string): string {
  const len = key.length;

  for (const rule of CASE_SUFFIX_RULES) {
    if (len > rule.longerThan && rule.suffixes.some((suffix) => key.endsWith(suffix))) {
      return key.substring(0, len - rule.strip);
    }
  }

  if (len > 3 && TRAILING_VOWELS.has(key.charAt(len - 1))) {
    return key.substring(0, len - 1);
  }

  return key;
}
154+
155+
/**
 * Removes a trailing possessive ending ("in" or "ov") from a word.
 *
 * The ending is removed only when the word is longer than five characters.
 *
 * Fix over the previous version: the length guard now applies to both endings.
 * Previously `len > 5 && A || B` stripped "ov" from words of any length, so
 * "kov" became "k" and "ov" became the empty string.
 *
 * @param s The word to process.
 * @returns The word without its last two characters when the rule applies,
 *          otherwise the word unchanged.
 */
function removePossessives(s: string): string {
  const len = s.length;
  if (len > 5 && (s.endsWith("in") || s.endsWith("ov"))) {
    // slice replaces the deprecated substr; the result is identical here.
    return s.slice(0, len - 2);
  }

  return s;
}
164+
165+
/** Final consonants that are rewritten to a plain counterpart: [cč] -> k, ľ -> l, ň -> n, ť -> t. */
const FINAL_CONSONANT_REPLACEMENTS: ReadonlyMap<string, string> = new Map([
  ["c", "k"],
  ["č", "k"],
  ["ľ", "l"],
  ["ň", "n"],
  ["ť", "t"],
]);

/** Vowels that may follow "i" in the i+vowel rule below. */
const VOWELS_AFTER_I: ReadonlySet<string> = new Set(["e", "a", "u"]);

/**
 * Normalizes the end of an already stripped stem.
 *
 * Two rules are tried, and at most one is applied:
 *  1. A final "c", "č", "ľ", "ň" or "ť" is replaced by "k", "k", "l", "n" or "t".
 *  2. Otherwise, for stems longer than three characters whose third-from-last
 *     character is "i" followed by "e", "a" or "u", that "i" is dropped
 *     (for example "priat" becomes "prat").
 *
 * Fixes over the previous version:
 *  - Rule 1 had its ternary inverted and replaced every character EXCEPT the
 *    last one ("noc" became "kkc" instead of "nok").
 *  - Rule 2 shifted the last two characters left but kept the original length,
 *    duplicating the final character ("priat" became "pratt").
 *    NOTE(review): dropping the "i" assumes this rule was ported from an
 *    array-based stemmer that shortens the word by one — confirm against the
 *    reference algorithm.
 *
 * A rule that collapsed a trailing "ic" was disabled in the original code
 * because it lowered false positives but raised false negatives; it is still
 * not applied.
 *
 * @param s The stem to normalize; an empty string is returned unchanged.
 * @returns The normalized stem.
 */
function normalize(s: string): string {
  const len = s.length;
  if (len === 0) {
    return s;
  }

  const replacement = FINAL_CONSONANT_REPLACEMENTS.get(s.charAt(len - 1));
  if (replacement !== undefined) {
    return s.slice(0, len - 1) + replacement;
  }

  if (len > 3 && s.charAt(len - 3) === "i" && VOWELS_AFTER_I.has(s.charAt(len - 2))) {
    // Drop the "i" and keep the vowel and the final character.
    return s.slice(0, len - 3) + s.slice(len - 2);
  }

  return s;
}
199+
200+
/** Entry point of the stemmer; exposes a single static method. */
export class SlovakStemmer {
  /**
   * Reduces a word to its stem by running, in order: `removePredpona`,
   * `removeCase`, `removePossessives`, and — only when something is left —
   * `normalize`.
   *
   * @param word The word to stem.
   * @returns The stem; an empty intermediate result is returned as is,
   *          without normalization.
   */
  public static steme(word: string): string {
    const withoutPrefix = removePredpona(word);
    const withoutCaseEnding = removeCase(withoutPrefix);
    const stem = removePossessives(withoutCaseEnding);

    return stem.length ? normalize(stem) : stem;
  }
}

0 commit comments

Comments
 (0)