Skip to content

Commit

Permalink
Make Reline::Unicode's vi_, ed_ and em_ methods encoding-safe
Browse files Browse the repository at this point in the history
  • Loading branch information
tompng committed Nov 12, 2024
1 parent 7f28396 commit e699b4f
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 50 deletions.
92 changes: 51 additions & 41 deletions lib/reline/unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -263,50 +263,49 @@ def self.get_prev_mbchar_size(line, byte_pointer)

# Returns the number of bytes between +byte_pointer+ and the end of the next
# word: a leading run of non-word characters followed by a run of word
# characters.
#
# Classification goes through word_character? only. The direct
# c.encode(UTF_8) checks were removed because they raise
# Encoding::UndefinedConversionError on characters with no UTF-8 mapping and
# their results were overwritten anyway.
#
# line::         the edited line (String, any encoding)
# byte_pointer:: byte offset of the cursor within +line+
def self.em_forward_word(line, byte_pointer)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  nonwords = gcs.take_while { |c| !word_character?(c) }
  words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
  nonwords.sum(&:bytesize) + words.sum(&:bytesize)
end

# Like em_forward_word, but also returns the replacement text for the span.
#
# Returns a two-element array: the byte size of the leading non-word run plus
# the following word, and a string made of the non-word run unchanged
# followed by the word passed through String#capitalize.
#
# Only word_character? is used for classification; the direct
# c.encode(UTF_8) checks were dropped because they raise on characters that
# cannot be converted to UTF-8 and their results were overwritten anyway.
def self.em_forward_word_with_capitalization(line, byte_pointer)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  nonwords = gcs.take_while { |c| !word_character?(c) }
  words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
  [nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize]
end

# Returns the number of bytes between the start of the previous word and
# +byte_pointer+: walking backwards from the cursor, a run of non-word
# characters followed by a run of word characters.
#
# Classification goes through word_character? only. The direct
# c.encode(UTF_8) checks were removed because they raise on characters with
# no UTF-8 mapping and their results were overwritten anyway.
def self.em_backward_word(line, byte_pointer)
  # Reversed so that take_while scans from the cursor towards the line start.
  gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
  nonwords = gcs.take_while { |c| !word_character?(c) }
  words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
  nonwords.sum(&:bytesize) + words.sum(&:bytesize)
end

# Returns the number of bytes between the start of the previous
# whitespace-delimited word and +byte_pointer+: walking backwards from the
# cursor, a run of whitespace followed by a run of non-whitespace.
#
# Each run is computed once, via space_character?; the duplicated inline
# regexp scans whose results were immediately overwritten are gone.
def self.em_big_backward_word(line, byte_pointer)
  # Reversed so that take_while scans from the cursor towards the line start.
  gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
  spaces = gcs.take_while { |c| space_character?(c) }
  nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
  spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
end

def self.ed_transpose_words(line, byte_pointer)
gcs = line.byteslice(0, byte_pointer).grapheme_clusters
pos = gcs.size
gcs += line.byteslice(byte_pointer..).grapheme_clusters
gcs.map! { |c| c.encode(Encoding::UTF_8) }
pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/)
pos += 1 while pos < gcs.size && !word_character?(gcs[pos])
if pos == gcs.size # 'aaa bbb [cursor] '
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
second_word_end = gcs.size
else # 'aaa [cursor]bbb'
pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/)
pos += 1 while pos < gcs.size && word_character?(gcs[pos])
second_word_end = pos
end
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
second_word_start = pos
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
first_word_end = pos
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
first_word_start = pos

[first_word_start, first_word_end, second_word_start, second_word_end].map do |idx|
Expand All @@ -316,72 +315,73 @@ def self.ed_transpose_words(line, byte_pointer)

# Returns the number of bytes from +byte_pointer+ to the start of the next
# whitespace-delimited word: a run of non-whitespace followed by a run of
# whitespace.
#
# Each run is computed once, via space_character?; the duplicated inline
# regexp scans whose results were immediately overwritten are gone.
def self.vi_big_forward_word(line, byte_pointer)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  nonspaces = gcs.take_while { |c| !space_character?(c) }
  spaces = gcs.drop(nonspaces.size).take_while { |c| space_character?(c) }
  nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize)
end

# Returns the number of bytes from +byte_pointer+ to the last character of
# the next whitespace-delimited word.
#
# The character under the cursor is always skipped. After it, a run of
# whitespace and a run of non-whitespace are collected, and the final
# collected character is dropped so the offset lands on it rather than
# past it.
#
# Each run is computed once, via space_character?; the duplicated inline
# regexp scans whose results were immediately overwritten are gone.
def self.vi_big_forward_end_word(line, byte_pointer)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  first = gcs.shift(1) # an array, so an empty line simply contributes 0 bytes
  spaces = gcs.take_while { |c| space_character?(c) }
  nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
  matched = spaces + nonspaces
  matched.pop
  first.sum(&:bytesize) + matched.sum(&:bytesize)
end

# Returns the number of bytes between the start of the previous
# whitespace-delimited word and +byte_pointer+: walking backwards from the
# cursor, a run of whitespace followed by a run of non-whitespace.
#
# Each run is computed once, via space_character?; the duplicated inline
# regexp scans whose results were immediately overwritten are gone.
def self.vi_big_backward_word(line, byte_pointer)
  # Reversed so that take_while scans from the cursor towards the line start.
  gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
  spaces = gcs.take_while { |c| space_character?(c) }
  nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
  spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
end

# Returns the number of bytes from +byte_pointer+ to the start of the next
# vi "word".
#
# The character under the cursor decides which class is consumed: word
# characters, whitespace, or everything that is neither. The run of that
# class is skipped, and then, unless +drop_terminate_spaces+ is true, any
# whitespace that follows it.
#
# Characters are classified with word_character? / space_character? on the
# original strings, so nothing here converts the line to UTF-8.
#
# line::                  the edited line (String)
# byte_pointer::          byte offset of the cursor within +line+
# drop_terminate_spaces:: when true, do not include the whitespace after the run
def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  return 0 if gcs.empty?

  first = gcs.first
  matched =
    if word_character?(first)
      gcs.take_while { |c| word_character?(c) }
    elsif space_character?(first)
      gcs.take_while { |c| space_character?(c) }
    else
      gcs.take_while { |c| !word_character?(c) && !space_character?(c) }
    end

  return matched.sum(&:bytesize) if drop_terminate_spaces

  spaces = gcs.drop(matched.size).take_while { |c| space_character?(c) }
  matched.sum(&:bytesize) + spaces.sum(&:bytesize)
end

# Returns the number of bytes from +byte_pointer+ to the last character of
# the next vi "word".
#
# The character under the cursor is always skipped. If it, or the character
# right after it, is whitespace, the following whitespace run is skipped as
# well. Then a run of word characters, or of characters that are neither
# word nor whitespace (decided by the first remaining character), is
# collected and its final character dropped so the offset lands on it rather
# than past it.
#
# Characters are classified with word_character? / space_character? on the
# original strings, so nothing here converts the line to UTF-8.
def self.vi_forward_end_word(line, byte_pointer)
  gcs = line.byteslice(byte_pointer..).grapheme_clusters
  return 0 if gcs.empty?
  return gcs.first.bytesize if gcs.size == 1

  start = gcs.shift
  skips = [start]
  if space_character?(start) || space_character?(gcs.first)
    spaces = gcs.take_while { |c| space_character?(c) }
    skips += spaces
    gcs.shift(spaces.size)
  end
  # gcs may be empty here; word_character?(nil) is falsy and take_while then
  # yields an empty run.
  start_with_word = word_character?(gcs.first)
  matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
  matched.pop
  skips.sum(&:bytesize) + matched.sum(&:bytesize)
end

# Returns the number of bytes between the start of the previous vi "word"
# and +byte_pointer+.
#
# Walking backwards from the cursor, whitespace is skipped first. Then a run
# of word characters, or of characters that are neither word nor whitespace
# (decided by the first non-space character met), is consumed.
#
# Classification goes through word_character? / space_character? only. The
# up-front conversion of every grapheme to UTF-8 was removed because it
# raises on characters with no UTF-8 mapping, and the regexp-based result it
# fed was discarded anyway.
def self.vi_backward_word(line, byte_pointer)
  # Reversed so that take_while scans from the cursor towards the line start.
  gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
  spaces = gcs.take_while { |c| space_character?(c) }
  gcs.shift(spaces.size)
  # gcs may be empty here; word_character?(nil) is falsy and take_while then
  # yields an empty run.
  start_with_word = word_character?(gcs.first)
  matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
  spaces.sum(&:bytesize) + matched.sum(&:bytesize)
end

def self.common_prefix(list, ignore_case: false)
Expand All @@ -399,7 +399,17 @@ def self.common_prefix(list, ignore_case: false)

# Returns the byte size of the whitespace run at the start of +line+, i.e.
# the byte offset of its first non-whitespace character (or the full byte
# size when the line is all whitespace).
#
# The run is computed once, via space_character?; the duplicated inline
# regexp scan whose result was immediately overwritten is gone.
def self.vi_first_print(line)
  gcs = line.grapheme_clusters
  spaces = gcs.take_while { |c| space_character?(c) }
  spaces.sum(&:bytesize)
end

# Whether +s+ is a word character, i.e. matches \p{Word} once converted to
# UTF-8.
#
# Always returns true or false:
# * nil (or false) is not a word character;
# * a string that cannot be converted to UTF-8 is not a word character,
#   whether the character has no UTF-8 mapping
#   (Encoding::UndefinedConversionError) or the bytes are malformed in the
#   source encoding (Encoding::InvalidByteSequenceError).
def self.word_character?(s)
  return false unless s

  s.encode(Encoding::UTF_8).match?(/\p{Word}/)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
  false
end

# Whether +s+ contains a whitespace character (regexp \s).
# Returns nil, not false, when +s+ itself is nil or false.
def self.space_character?(s)
  return unless s

  s.match?(/\s/)
end
end
45 changes: 36 additions & 9 deletions test/reline/test_unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,15 @@ def test_encoding_conversion

# em_forward_word from byte offset 3: expected byte counts for a mixed
# ASCII/Japanese line in UTF-8 and in Shift_JIS, and for lines that end in a
# word, end in punctuation, or end at the cursor.
def test_em_forward_word
  mixed = 'abc---fooあbar-baz'
  cases = [
    [12, mixed],
    [11, mixed.encode('sjis')],
    [3, 'abcfoo'],
    [3, 'abc---'],
    [0, 'abc']
  ]
  cases.each do |expected, line|
    assert_equal(expected, Reline::Unicode.em_forward_word(line, 3))
  end
end

def test_em_forward_word_with_capitalization
assert_equal([12, '---Fooあbar'], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz', 3))
assert_equal([11, '---Fooあbar'.encode('sjis')], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz'.encode('sjis'), 3))
assert_equal([3, 'Foo'], Reline::Unicode.em_forward_word_with_capitalization('abcfOo', 3))
assert_equal([3, '---'], Reline::Unicode.em_forward_word_with_capitalization('abc---', 3))
assert_equal([0, ''], Reline::Unicode.em_forward_word_with_capitalization('abc', 3))
Expand All @@ -162,13 +164,15 @@ def test_em_forward_word_with_capitalization

# em_backward_word: expected byte counts walking back from the cursor over a
# mixed ASCII/Japanese line (UTF-8 and Shift_JIS), an all-whitespace line, a
# single word, and the start of the line.
def test_em_backward_word
  assert_equal(12, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz', 20))
  assert_equal(11, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
  # The whitespace line must be two bytes long for a cursor at byte 2 to
  # yield 2; spelled with * so the width cannot be collapsed by accident.
  assert_equal(2, Reline::Unicode.em_backward_word(' ' * 2, 2))
  assert_equal(2, Reline::Unicode.em_backward_word('ab', 2))
  assert_equal(0, Reline::Unicode.em_backward_word('ab', 0))
end

def test_em_big_backward_word
assert_equal(16, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz', 20))
assert_equal(15, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
assert_equal(2, Reline::Unicode.em_big_backward_word(' ', 2))
assert_equal(2, Reline::Unicode.em_big_backward_word('ab', 2))
assert_equal(0, Reline::Unicode.em_big_backward_word('ab', 0))
Expand All @@ -184,20 +188,20 @@ def test_ed_transpose_words
assert_equal([3, 5, 6, 8], Reline::Unicode.ed_transpose_words('aa bb cc ', 7))
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 8))
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 9))
word1 = 'fooあ'
word2 = 'barあbaz'
left = 'aaa -'
middle = '- -'
right = '- bbb'
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
['sjis', 'utf-8'].each do |encoding|
texts = ['fooあ', 'barあbaz', 'aaa -', '- -', '- bbb']
word1, word2, left, middle, right = texts.map { |text| text.encode(encoding) }
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
end
end

def test_vi_big_forward_word
assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz xyz', 3))
assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ --', 3))
assert_equal(7, Reline::Unicode.vi_big_forward_word('abcfooあ --'.encode('sjis'), 3))
assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3))
assert_equal(3, Reline::Unicode.vi_big_forward_word('abc- ', 3))
assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3))
Expand All @@ -211,6 +215,7 @@ def test_vi_big_forward_end_word
assert_equal(1, Reline::Unicode.vi_big_forward_end_word('aa b', 0))
assert_equal(3, Reline::Unicode.vi_big_forward_end_word(' aa b', 0))
assert_equal(15, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz', 3))
assert_equal(14, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz'.encode('sjis'), 3))
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ --', 3))
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ', 3))
assert_equal(2, Reline::Unicode.vi_big_forward_end_word('abc- ', 3))
Expand All @@ -219,6 +224,7 @@ def test_vi_big_forward_end_word

def test_vi_big_backward_word
assert_equal(16, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz', 20))
assert_equal(15, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
assert_equal(2, Reline::Unicode.vi_big_backward_word(' ', 2))
assert_equal(2, Reline::Unicode.vi_big_backward_word('ab', 2))
assert_equal(0, Reline::Unicode.vi_big_backward_word('ab', 0))
Expand All @@ -227,6 +233,7 @@ def test_vi_big_backward_word
def test_vi_forward_word
assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3))
assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6))
assert_equal(8, Reline::Unicode.vi_forward_word('abc---fooあbar-baz'.encode('sjis'), 6))
assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3))
assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3))
assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3))
Expand All @@ -235,6 +242,7 @@ def test_vi_forward_word
def test_vi_forward_end_word
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3))
assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6))
assert_equal(7, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz'.encode('sjis'), 6))
assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3))
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3))
assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3))
Expand All @@ -243,6 +251,7 @@ def test_vi_forward_end_word
def test_vi_backward_word
assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20))
assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17))
assert_equal(8, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 16))
assert_equal(2, Reline::Unicode.vi_backward_word(' ', 2))
assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2))
assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0))
Expand All @@ -252,6 +261,24 @@ def test_vi_first_print
assert_equal(3, Reline::Unicode.vi_first_print(' abcdefg'))
assert_equal(3, Reline::Unicode.vi_first_print(' '))
assert_equal(0, Reline::Unicode.vi_first_print('abc'))
assert_equal(0, Reline::Unicode.vi_first_print('あ'))
assert_equal(0, Reline::Unicode.vi_first_print('あ'.encode('sjis')))
assert_equal(0, Reline::Unicode.vi_first_print(''))
end

# word_character? / space_character? on ASCII, Japanese (UTF-8 and
# Shift_JIS), punctuation, nil, and a raw Shift_JIS code point.
def test_character_type
  # NOTE(review): presumably a Shift_JIS code with no UTF-8 mapping - confirm.
  odd_sjis = 33345.chr('sjis')

  ['a', 'あ', 'あ'.encode('sjis')].each do |c|
    assert(Reline::Unicode.word_character?(c))
  end
  [odd_sjis, '-', nil].each do |c|
    refute(Reline::Unicode.word_character?(c))
  end

  assert(Reline::Unicode.space_character?(' '))
  ['あ', 'あ'.encode('sjis'), odd_sjis, '-', nil].each do |c|
    refute(Reline::Unicode.space_character?(c))
  end
end
end

0 comments on commit e699b4f

Please sign in to comment.