From 7f283963753e8d857b795309eca2922674b51fe8 Mon Sep 17 00:00:00 2001 From: tompng Date: Sun, 9 Jun 2024 13:34:34 +0900 Subject: [PATCH] Refactor Reline::Unicode vi_ ed_ em_ methods --- lib/reline/unicode.rb | 436 ++++++++---------------------------- test/reline/test_unicode.rb | 14 +- 2 files changed, 97 insertions(+), 353 deletions(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index f8eb365069..afcdaf1e43 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -262,375 +262,126 @@ def self.get_prev_mbchar_size(line, byte_pointer) end def self.em_forward_word(line, byte_pointer) - byte_size = 0 - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size += size - end - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size += size - end - byte_size + gcs = line.byteslice(byte_pointer..).grapheme_clusters + nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } + words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + nonwords.sum(&:bytesize) + words.sum(&:bytesize) end def self.em_forward_word_with_capitalization(line, byte_pointer) - byte_size = 0 - new_str = String.new - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - new_str += mbchar - byte_size += size - end - first = true - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - if first - new_str += mbchar.upcase - first = false - else - new_str += mbchar.downcase - end - byte_size += size - end - [byte_size, new_str] + gcs = line.byteslice(byte_pointer..).grapheme_clusters + nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } + words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + [nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize] end def self.em_backward_word(line, byte_pointer) - byte_size = 0 - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size += size - end - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size += size - end - byte_size + gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse + nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } + words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + nonwords.sum(&:bytesize) + words.sum(&:bytesize) end def self.em_big_backward_word(line, byte_pointer) - byte_size = 0 - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar =~ /\S/ - byte_size += size - end - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar =~ /\s/ - byte_size += size - end - byte_size + gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse + spaces = gcs.take_while { |c| c.match?(/\s/) } + nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize) end def self.ed_transpose_words(line, byte_pointer) - right_word_start = nil - size = get_next_mbchar_size(line, byte_pointer) - mbchar = line.byteslice(byte_pointer, size) - if size.zero? - # ' aaa bbb [cursor]' - byte_size = 0 - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size -= size - end - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size -= size - end - right_word_start = byte_pointer + byte_size - byte_size = 0 - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size += size - end - after_start = byte_pointer + byte_size - elsif mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - # ' aaa bb[cursor]b' - byte_size = 0 - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size -= size - end - right_word_start = byte_pointer + byte_size - byte_size = 0 - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size += size - end - after_start = byte_pointer + byte_size - else - byte_size = 0 - while (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size += size - end - if (byte_pointer + byte_size) == (line.bytesize - 1) - # ' aaa bbb [cursor] ' - after_start = line.bytesize - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size -= size - end - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size -= size - end - right_word_start = byte_pointer + byte_size - else - # ' aaa [cursor] bbb ' - right_word_start = byte_pointer + byte_size - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size += size - end - after_start = byte_pointer + byte_size - end - end - byte_size = right_word_start - byte_pointer - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\p{Word}/ - byte_size -= size - end - middle_start = byte_pointer + byte_size - byte_size = middle_start - byte_pointer - while 0 < (byte_pointer + byte_size) - size = get_prev_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size - size, size) - break if mbchar.encode(Encoding::UTF_8) =~ /\P{Word}/ - byte_size -= size + gcs = line.byteslice(0, byte_pointer).grapheme_clusters + pos = gcs.size + gcs += line.byteslice(byte_pointer..).grapheme_clusters + gcs.map! { |c| c.encode(Encoding::UTF_8) } + pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/) + if pos == gcs.size # 'aaa bbb [cursor] ' + pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/) + second_word_end = gcs.size + else # 'aaa [cursor]bbb' + pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/) + second_word_end = pos + end + pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/) + second_word_start = pos + pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/) + first_word_end = pos + pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/) + first_word_start = pos + + [first_word_start, first_word_end, second_word_start, second_word_end].map do |idx| + gcs.take(idx).sum(&:bytesize) end - left_word_start = byte_pointer + byte_size - [left_word_start, middle_start, right_word_start, after_start] end def self.vi_big_forward_word(line, byte_pointer) - byte_size = 0 - while (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar =~ /\s/ - byte_size += size - end - while (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar =~ /\S/ - byte_size += size - end - byte_size + gcs = line.byteslice(byte_pointer..).grapheme_clusters + nonspaces = gcs.take_while { |c| c.match?(/\S/) } + spaces = gcs.drop(nonspaces.size).take_while { |c| c.match?(/\s/) } + nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize) end def self.vi_big_forward_end_word(line, byte_pointer) - if (line.bytesize - 1) > byte_pointer - size = get_next_mbchar_size(line, byte_pointer) - byte_size = size - else - return 0 - end - while (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar =~ /\S/ - byte_size += size - end - prev_byte_size = byte_size - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar =~ /\s/ - prev_byte_size = byte_size - byte_size += size - end - prev_byte_size + gcs = line.byteslice(byte_pointer..).grapheme_clusters + first = gcs.shift(1) + spaces = gcs.take_while { |c| c.match?(/\s/) } + nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + matched = spaces + nonspaces + matched.pop + first.sum(&:bytesize) + matched.sum(&:bytesize) end def self.vi_big_backward_word(line, byte_pointer) - byte_size = 0 - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar =~ /\S/ - byte_size += size - end - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - break if mbchar =~ /\s/ - byte_size += size - end - byte_size + gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse + spaces = gcs.take_while { |c| c.match?(/\s/) } + nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize) end def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false) - if line.bytesize > byte_pointer - size = get_next_mbchar_size(line, byte_pointer) - mbchar = line.byteslice(byte_pointer, size) - if mbchar =~ /\w/ - started_by = :word - elsif mbchar =~ /\s/ - started_by = :space + gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) } + return 0 if gcs.empty? + + regexp = + case gcs.first + when /\p{Word}/ + /\p{Word}/ + when /\s/ + /\s/ else - started_by = :non_word_printable + /[^\p{Word}\s]/ end - byte_size = size - else - return 0 - end - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - case started_by - when :word - break if mbchar =~ /\W/ - when :space - break if mbchar =~ /\S/ - when :non_word_printable - break if mbchar =~ /\w|\s/ - end - byte_size += size - end - return byte_size if drop_terminate_spaces - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - break if mbchar =~ /\S/ - byte_size += size - end - byte_size + matched = gcs.take_while { |c| c.match?(regexp) } + return matched.sum(&:bytesize) if drop_terminate_spaces + + spaces = gcs.drop(matched.size).take_while { |c| c.match?(/\s/) } + matched.sum(&:bytesize) + spaces.sum(&:bytesize) end def self.vi_forward_end_word(line, byte_pointer) - if (line.bytesize - 1) > byte_pointer - size = get_next_mbchar_size(line, byte_pointer) - mbchar = line.byteslice(byte_pointer, size) - if mbchar =~ /\w/ - started_by = :word - elsif mbchar =~ /\s/ - started_by = :space - else - started_by = :non_word_printable - end - byte_size = size - else - return 0 - end - if (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - if mbchar =~ /\w/ - second = :word - elsif mbchar =~ /\s/ - second = :space - else - second = :non_word_printable - end - second_byte_size = size - else - return byte_size - end - if second == :space - byte_size += second_byte_size - while (line.bytesize - 1) > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - if mbchar =~ /\S/ - if mbchar =~ /\w/ - started_by = :word - else - started_by = :non_word_printable - end - break - end - byte_size += size - end - else - case [started_by, second] - when [:word, :non_word_printable], [:non_word_printable, :word] - started_by = second - else - byte_size += second_byte_size - started_by = second - end - end - prev_byte_size = byte_size - while line.bytesize > (byte_pointer + byte_size) - size = get_next_mbchar_size(line, byte_pointer + byte_size) - mbchar = line.byteslice(byte_pointer + byte_size, size) - case started_by - when :word - break if mbchar =~ /\W/ - when :non_word_printable - break if mbchar =~ /[\w\s]/ - end - prev_byte_size = byte_size - byte_size += size - end - prev_byte_size + gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) } + return 0 if gcs.empty? + return gcs.first.bytesize if gcs.size == 1 + + start = gcs.shift + skips = [start] + if start.match?(/\s/) || gcs.first.match?(/\s/) + spaces = gcs.take_while { |c| c.match?(/\s/) } + skips += spaces + gcs.shift(spaces.size) + end + regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/ + matched = gcs.take_while { |c| c.match?(regexp) } + matched.pop + skips.sum(&:bytesize) + matched.sum(&:bytesize) end def self.vi_backward_word(line, byte_pointer) - byte_size = 0 - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - if mbchar =~ /\S/ - if mbchar =~ /\w/ - started_by = :word - else - started_by = :non_word_printable - end - break - end - byte_size += size - end - while 0 < (byte_pointer - byte_size) - size = get_prev_mbchar_size(line, byte_pointer - byte_size) - mbchar = line.byteslice(byte_pointer - byte_size - size, size) - case started_by - when :word - break if mbchar =~ /\W/ - when :non_word_printable - break if mbchar =~ /[\w\s]/ - end - byte_size += size - end - byte_size + gcs = line.byteslice(0, byte_pointer).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }.reverse + spaces = gcs.take_while { |c| c.match?(/\s/) } + gcs.shift(spaces.size) + regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/ + spaces.sum(&:bytesize) + gcs.take_while { |c| c.match?(regexp) }.sum(&:bytesize) end def self.common_prefix(list, ignore_case: false) @@ -647,15 +398,8 @@ def self.common_prefix(list, ignore_case: false) end def self.vi_first_print(line) - byte_size = 0 - while (line.bytesize - 1) > byte_size - size = get_next_mbchar_size(line, byte_size) - mbchar = line.byteslice(byte_size, size) - if mbchar =~ /\S/ - break - end - byte_size += size - end - byte_size + gcs = line.grapheme_clusters + spaces = gcs.take_while { |c| c.match?(/\s/) } + spaces.sum(&:bytesize) end end diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 07ed8c6230..773dcc9874 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -199,7 +199,7 @@ def test_vi_big_forward_word assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz xyz', 3)) assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ --', 3)) assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3)) - assert_equal(2, Reline::Unicode.vi_big_forward_word('abc- ', 3)) # maybe inconsistent + assert_equal(3, Reline::Unicode.vi_big_forward_word('abc- ', 3)) assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3)) end @@ -226,23 +226,23 @@ def test_vi_big_backward_word def test_vi_forward_word assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3)) - assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6)) # maybe bug - assert_equal(3, Reline::Unicode.vi_forward_word('abcfooあ', 3)) # maybe bug + assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6)) + assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3)) assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3)) assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3)) end def test_vi_forward_end_word assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3)) - assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6)) # maybe bug - assert_equal(2, Reline::Unicode.vi_forward_end_word('abcfooあ', 3)) # maybe bug + assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6)) + assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3)) assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3)) assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3)) end def test_vi_backward_word assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20)) - assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17)) # maybe bug + assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17)) assert_equal(2, Reline::Unicode.vi_backward_word(' ', 2)) assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2)) assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0)) @@ -250,7 +250,7 @@ def test_vi_backward_word def test_vi_first_print assert_equal(3, Reline::Unicode.vi_first_print(' abcdefg')) - assert_equal(2, Reline::Unicode.vi_first_print(' ')) # maybe inconsistent + assert_equal(3, Reline::Unicode.vi_first_print(' ')) assert_equal(0, Reline::Unicode.vi_first_print('abc')) assert_equal(0, Reline::Unicode.vi_first_print('')) end