Skip to content

Commit f3a0e71

Browse files
committed
Fix a bug when the option to skip white space is disabled (#231)
1 parent df60d94 commit f3a0e71

File tree

2 files changed

+129
-4
lines changed

2 files changed

+129
-4
lines changed

filter/sentence_splitter.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,19 +62,22 @@ func (s SentenceSplitter) ScanSentences(data []byte, atEOF bool) (advance int, t
6262
r, size := utf8.DecodeRune(data[p:])
6363
if s.SkipWhiteSpace && unicode.IsSpace(r) {
6464
p += size
65-
if head {
65+
switch {
66+
case head:
6667
start, end = p, p
67-
} else if s.isDelim(r) {
68+
case s.isDelim(r):
6869
return p, data[start:end], nil
69-
} else if s.DoubleLineFeedSplit && r == '\n' {
70+
case s.DoubleLineFeedSplit && r == '\n':
7071
if nn {
7172
return p, data[start:end], nil
7273
}
7374
nn = true
75+
case nn:
76+
nn = false
7477
}
7578
continue
7679
}
77-
head, nn = false, false // clear flags
80+
head = false
7881
if end != p {
7982
for i := 0; i < size; i++ {
8083
data[end+i] = data[p+i]
@@ -83,6 +86,12 @@ func (s SentenceSplitter) ScanSentences(data []byte, atEOF bool) (advance int, t
8386
p += size
8487
end += size
8588
rcount++
89+
if s.DoubleLineFeedSplit && r == '\n' {
90+
if nn {
91+
return p, data[start:end], nil
92+
}
93+
nn = true
94+
}
8695
if !s.isDelim(r) && rcount < s.MaxRuneLen {
8796
continue
8897
}

filter/sentence_splitter_test.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,122 @@ func Test_DelimWhiteSpace(t *testing.T) {
109109
}
110110
}
111111

112+
func Test_DisableSkipWhiteSpace(t *testing.T) {
113+
testdata := []struct {
114+
input string
115+
expect []string
116+
}{
117+
{
118+
input: "",
119+
expect: []string{},
120+
},
121+
{
122+
input: "あああ",
123+
expect: []string{"あああ"},
124+
},
125+
{
126+
input: " ",
127+
expect: []string{" "},
128+
},
129+
{
130+
input: " x x x ",
131+
expect: []string{" ", "x ", "x ", "x "},
132+
},
133+
{
134+
input: "こんにちはは「さようなら 」 \U0001f363が好き",
135+
expect: []string{"こんにちはは「さようなら 」 ", "\U0001f363が好き"},
136+
},
137+
{
138+
input: "こんにちは \t\tさようなら おはよう おやすみ ",
139+
expect: []string{"こんにちは ", "\t\tさようなら ", "おはよう ", "おやすみ "},
140+
},
141+
{
142+
input: "すももも\t\n\n \tももも\n\nもものうち\n\n",
143+
expect: []string{"すももも\t\n\n", " ", "\tももも\n\n", "もものうち\n\n"},
144+
},
145+
}
146+
147+
s := filter.SentenceSplitter{
148+
Delim: []rune{' ', ' '}, // white spaces
149+
Follower: []rune{'.', '」', '」', '』', ')', ')', '}', '}', '〉', '》'},
150+
SkipWhiteSpace: false,
151+
DoubleLineFeedSplit: true,
152+
MaxRuneLen: 256,
153+
}
154+
for _, d := range testdata {
155+
scanner := bufio.NewScanner(strings.NewReader(d.input))
156+
scanner.Split(s.ScanSentences)
157+
r := make([]string, 0, len(d.expect))
158+
for scanner.Scan() {
159+
r = append(r, scanner.Text())
160+
}
161+
if !reflect.DeepEqual(r, d.expect) {
162+
t.Errorf("input %q, got %#v, expected %#v", d.input, r, d.expect)
163+
}
164+
if got := strings.Join(r, ""); got != d.input {
165+
t.Errorf("got len=%d %q, input len=%d %q", len(got), got, len(d.input), d.input)
166+
}
167+
}
168+
}
169+
170+
func Test_DisableSkipWhiteSpaceAndDoubleLineFeedSplit(t *testing.T) {
171+
testdata := []struct {
172+
input string
173+
expect []string
174+
}{
175+
{
176+
input: "",
177+
expect: []string{},
178+
},
179+
{
180+
input: "あああ",
181+
expect: []string{"あああ"},
182+
},
183+
{
184+
input: " ",
185+
expect: []string{" "},
186+
},
187+
{
188+
input: " x x x ",
189+
expect: []string{" ", "x ", "x ", "x "},
190+
},
191+
{
192+
input: "こんにちはは「さようなら 」 \U0001f363が好き",
193+
expect: []string{"こんにちはは「さようなら 」 ", "\U0001f363が好き"},
194+
},
195+
{
196+
input: "こんにちは \t\tさようなら おはよう おやすみ ",
197+
expect: []string{"こんにちは ", "\t\tさようなら ", "おはよう ", "おやすみ "},
198+
},
199+
{
200+
input: "すももも\t\n\n \tももも\n\nもものうち\n\n",
201+
expect: []string{"すももも\t\n\n ", "\tももも\n\nもものうち\n\n"},
202+
},
203+
}
204+
205+
s := filter.SentenceSplitter{
206+
Delim: []rune{' ', ' '}, // white spaces
207+
Follower: []rune{'.', '」', '」', '』', ')', ')', '}', '}', '〉', '》'},
208+
SkipWhiteSpace: false,
209+
DoubleLineFeedSplit: false,
210+
MaxRuneLen: 256,
211+
}
212+
for _, d := range testdata {
213+
scanner := bufio.NewScanner(strings.NewReader(d.input))
214+
scanner.Split(s.ScanSentences)
215+
r := make([]string, 0, len(d.expect))
216+
for scanner.Scan() {
217+
r = append(r, scanner.Text())
218+
}
219+
if !reflect.DeepEqual(r, d.expect) {
220+
t.Errorf("input %q, got %#v, expected %#v", d.input, r, d.expect)
221+
}
222+
if got := strings.Join(r, ""); got != d.input {
223+
t.Errorf("got len=%d %q, input len=%d %q", len(got), got, len(d.input), d.input)
224+
}
225+
}
226+
}
227+
112228
func Test_ScanSentences(t *testing.T) {
113229
testdata := []struct {
114230
atEnd bool

0 commit comments

Comments
 (0)