Skip to content

Commit

Permalink
Calculate mbchar width with bsearch (#632)
Browse files Browse the repository at this point in the history
  • Loading branch information
tompng authored Aug 29, 2024
1 parent 14784ed commit 0851e93
Show file tree
Hide file tree
Showing 3 changed files with 1,311 additions and 1,288 deletions.
93 changes: 35 additions & 58 deletions bin/generate_east_asian_width
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ if ARGV.empty?
exit 1
end

# Translates one EastAsianWidth.txt entry into a display width.
#
# type     - East Asian Width property value as a String
#            ('F', 'W', 'H', 'Na', 'N' or 'A').
# category - general category as a String (e.g. 'Mn').
#
# Returns 0 for nonspacing marks, 2 for Fullwidth/Wide, 1 for
# Halfwidth/Narrow/Neutral, -1 for Ambiguous, and nil for any other type.
def unicode_width(type, category)
  # Nonspacing Mark: checked first, so it wins over the width type.
  return 0 if category == 'Mn'

  if %w[F W].include?(type) # Fullwidth, Wide
    2
  elsif %w[H Na N].include?(type) # Halfwidth, Narrow, Neutral
    1
  elsif type == 'A' # Ambiguous
    -1
  end
end

open(ARGV.first, 'rt') do |f|
if m = f.gets.match(/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/)
unicode_version = m[1]
Expand All @@ -13,66 +25,31 @@ open(ARGV.first, 'rt') do |f|
unicode_version = nil
end

list = []
widths = []
f.each_line do |line|
next unless m = line.match(/^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)\s+#.+/)
next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line

first = m[1].to_i(16)
last = m[2]&.to_i(16) || first
type = m[3].to_sym
if !list.empty? and (list.last[:range].last + 1) == first and list.last[:type] == type
list.last[:range] = (list.last[:range].first..last)
else
# [\u{D800}-\u{DFFF}] cause error.
unless ((0xD800..0xDFFF).to_a & (first..last).to_a).empty?
unless (first..0xD7FF).to_a.empty?
list << {
range: (first..0xD7FF),
type: type.to_sym
}
end
unless (0xE000..last).to_a.empty?
list << {
range: (first..0xD7FF),
type: type.to_sym
}
end
else
list << {
range: (first..last),
type: type.to_sym
}
end
end
range = first.to_i(16)..(last || first).to_i(16)
widths.fill(unicode_width(type, category), range)
end
grouped = list.group_by { |item| item[:type] }.map { |item| [item.first, item.last.map { |row| row[:range] }] }.to_h
grouped = %i{F H W Na A N}.map { |type| [type, grouped[type]] }
puts <<EOH
class Reline::Unicode::EastAsianWidth
# This is based on EastAsianWidth.txt
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}

EOH
puts grouped.map { |item|
type, ranges = item
output = " # %s\n" %
case type
when :F then 'Fullwidth'
when :H then 'Halfwidth'
when :W then 'Wide'
when :Na then 'Narrow'
when :A then 'Ambiguous'
when :N then 'Neutral'
end
output += " TYPE_%s = /^[\#{ %%W(\n" % type.upcase
output += ranges.map { |range|
if range.first == range.last
' \u{%04X}' % range.first
else
' \u{%04X}-\u{%04X}' % [range.first, range.last]
end
}.join("\n")
output += "\n ).join }]/\n"
}.join("\n")
puts 'end'
# EscapedPairs
[*0x00..0x1F, 0x7F].each { |ord| widths[ord] = 2 }
# printable ASCII chars
(0x20..0x7E).each { |ord| widths[ord] = 1 }

chunks = widths.each_with_index.chunk { |width, _idx| width || 1 }
chunk_last_ords = chunks.map { |width, chunk| [chunk.last.last, width] }
chunk_last_ords << [0x7fffffff, 1]

puts <<~EOH
class Reline::Unicode::EastAsianWidth
# This is based on EastAsianWidth.txt
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}
CHUNK_LAST, CHUNK_WIDTH = [
#{chunk_last_ords.map { |ord, width| " [0x#{ord.to_s(16)}, #{width}]" }.join(",\n")}
].transpose.map(&:freeze)
end
EOH
end
53 changes: 14 additions & 39 deletions lib/reline/unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,51 +56,26 @@ def self.escape_for_print(str)

require 'reline/unicode/east_asian_width'

HalfwidthDakutenHandakuten = /[\u{FF9E}\u{FF9F}]/

MBCharWidthRE = /
(?<width_2_1>
[#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...)
)
| (?<width_3>^\u{2E3B}) (?# THREE-EM DASH)
| (?<width_0>^\p{M})
| (?<width_2_2>
#{ EastAsianWidth::TYPE_F }
| #{ EastAsianWidth::TYPE_W }
)
| (?<width_1>
#{ EastAsianWidth::TYPE_H }
| #{ EastAsianWidth::TYPE_NA }
| #{ EastAsianWidth::TYPE_N }
)(?!#{ HalfwidthDakutenHandakuten })
| (?<width_2_3>
(?: #{ EastAsianWidth::TYPE_H }
| #{ EastAsianWidth::TYPE_NA }
| #{ EastAsianWidth::TYPE_N })
#{ HalfwidthDakutenHandakuten }
)
| (?<ambiguous_width>
#{EastAsianWidth::TYPE_A}
)
/x

def self.get_mbchar_width(mbchar)
ord = mbchar.ord
if (0x00 <= ord and ord <= 0x1F) # in EscapedPairs
if ord <= 0x1F # in EscapedPairs
return 2
elsif (0x20 <= ord and ord <= 0x7E) # printable ASCII chars
elsif ord <= 0x7E # printable ASCII chars
return 1
end
m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE)
case
when m.nil? then 1 # TODO should be U+FFFD � REPLACEMENT CHARACTER
when m[:width_2_1], m[:width_2_2], m[:width_2_3] then 2
when m[:width_3] then 3
when m[:width_0] then 0
when m[:width_1] then 1
when m[:ambiguous_width] then Reline.ambiguous_width
utf8_mbchar = mbchar.encode(Encoding::UTF_8)
ord = utf8_mbchar.ord
chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o }
size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
if size == -1
Reline.ambiguous_width
elsif size == 1 && utf8_mbchar.size >= 2
second_char_ord = utf8_mbchar[1].ord
# Halfwidth Dakuten Handakuten
# Only these two characters have the Letter Modifier category and can be combined in a single grapheme cluster
(second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1
else
nil
size
end
end

Expand Down
Loading

0 comments on commit 0851e93

Please sign in to comment.