Skip to content

Commit

Permalink
Variation Selector 15 (VS-15, U+FE0E) support.
Browse files Browse the repository at this point in the history
I did a few spot checks of VS-15 when implementing VS-16, and
erroneously believed that all emojis in VS-15 sequences were already
listed as an EAW width of 1. But that's not true. There are several
emojis that are "wide" that are changed to "narrow" with VS-15.
  • Loading branch information
jquast committed Feb 14, 2024
1 parent b0e4c88 commit 651f52c
Show file tree
Hide file tree
Showing 11 changed files with 270 additions and 28 deletions.
68 changes: 58 additions & 10 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,19 +417,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
"""
table: dict[UnicodeVersion, TableDef] = {}
unicode_latest = fetch_unicode_versions()[-1]
hex_str_vs = 'FE0F'

wide_tables = fetch_table_wide_data().table
unicode_version = UnicodeVersion.parse('9.0.0')

# parse table formatted by the latest emoji release (developed with
# 15.1.0) and parse a single file for all individual releases
table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
ubound_unicode_version=unicode_version)
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs)

# parse and join the final emoji release 12.0 of the earlier "type"
table[unicode_version].values.update(
parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
ubound_unicode_version=unicode_version).values)
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs).values)

# perform culling on any values that are already understood as 'wide'
# without the variation-16 selector
Expand All @@ -442,16 +445,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)


def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
with open(fname, encoding='utf-8') as fin:
table_iter = parse_vs16_table(fin)
table_iter = parse_vs_table(fin, hex_str_vs)
# pull "date string"
date = next(table_iter).comment.split(':', 1)[1].strip()
# pull values only matching this unicode version and lower
values = {entry.code_range[0] for entry in table_iter}
return TableDef(ubound_unicode_version, date, values)


def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
    """
    Fetch and create a "wide to narrow variation-15" lookup table.

    Characters in this table are wide, but when combined with a variation
    selector-15 (U+FE0E), they become narrow, for the given versions of unicode.

    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the
    effect of *any* 'variation selector-15' wide emoji becoming narrow.

    Some terminals display U+231A, U+FE0E as a narrow font, but consuming a wide
    cell (iTerm2), while most others display it as a wide cell, only.

    It is fair to call these ambiguous, see related 'ucs-detect' project.

    :returns: render context named ``VS15_WIDE_TO_NARROW`` holding a single
        table, keyed by unicode version 9.0.0.
    """
    # NOTE: the selector is spelled "U+FE0E" above rather than as a backslash-u
    # escape, because this docstring is not a raw string and the escape would be
    # decoded into an invisible variation-selector character.
    table: dict[UnicodeVersion, TableDef] = {}
    unicode_latest = fetch_unicode_versions()[-1]
    hex_str_vs = 'FE0E'

    wide_tables = fetch_table_wide_data().table
    unicode_version = UnicodeVersion.parse('9.0.0')

    # parse table formatted by the latest emoji release (developed with
    # 15.1.0) and parse a single file for all individual releases
    table[unicode_version] = parse_vs_data(
        fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
        ubound_unicode_version=unicode_version,
        hex_str_vs=hex_str_vs)

    # parse and join the final emoji release 12.0 of the earlier "type"
    table[unicode_version].values.update(
        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
                      ubound_unicode_version=unicode_version,
                      hex_str_vs=hex_str_vs).values)

    # perform culling on any values that are already understood as 'narrow'
    # without the variation-15 selector: only codepoints found in the wide
    # table for this version are kept.
    wide_table = wide_tables[unicode_version].as_value_ranges()
    table[unicode_version].values = {
        ucs for ucs in table[unicode_version].values
        if _bisearch(ucs, wide_table)
    }

    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)


def cite_source_description(filename: str) -> tuple[str, str]:
"""Return unicode.org source data file's own description as citation."""
with open(filename, encoding='utf-8') as f:
Expand Down Expand Up @@ -496,9 +544,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
yield TableEntry(code_range, tuple(properties), comment)


def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
hex_str_vs16 = 'FE0F'
def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`"""
for line in fp:
data, _, comment = line.partition('#')
data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
Expand All @@ -510,7 +557,7 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
yield TableEntry(None, tuple(properties), comment)
continue
code_points = code_points_str.split()
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
if len(code_points) == 2 and code_points[1] == hex_str_vs:
# yield a single "code range" entry for a single value that precedes the variation selector
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)

Expand Down Expand Up @@ -663,6 +710,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
)
yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
Expand Down
5 changes: 2 additions & 3 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@


def bisearch_pair(ucs, table):
"""
A copy of wcwidth._bisearch() but also returns the range of matched values.
"""
"""A copy of wcwidth._bisearch() but also returns the range of matched values."""
lbound = 0
ubound = len(table) - 1

Expand All @@ -85,6 +83,7 @@ def bisearch_pair(ucs, table):


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
Expand Down
4 changes: 4 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ Other Languages
History
=======

0.2.14 *2024-02-14*
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
Variation Selector 15 (`PR #999`_).

0.2.13 *2024-01-06*
* **Bugfix** zero-width support for Hangul Jamo (Korean)

Expand Down
4 changes: 4 additions & 0 deletions docs/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ Width of 1
String characters are measured width of 1 when they are not
measured as `Width of 0`_ or `Width of 2`_.

Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
by `emoji-variation-sequences.txt`_ as ``text style``.

Width of 2
----------

Expand All @@ -73,6 +76,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
.. _`U+2029`: https://codepoints.net/U+2029
.. _`U+D7B0`: https://codepoints.net/U+D7B0
.. _`U+D7FF`: https://codepoints.net/U+D7FF
.. _`U+FE0E`: https://codepoints.net/U+FE0E
.. _`U+FE0F`: https://codepoints.net/U+FE0F
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main():
setuptools.setup(
name='wcwidth',
# NOTE: manually manage __version__ in wcwidth/__init__.py !
version='0.2.13',
version='0.2.14',
description=(
"Measures the displayed width of unicode strings in a terminal"),
long_description=codecs.open(
Expand Down
75 changes: 69 additions & 6 deletions tests/test_emojis.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_longer_emoji_zwj_sequence():
u"\u200d" # 'Cf', 'N' -- ZERO WIDTH JOINER
u"\U0001F9D1" # 'So', 'W' -- ADULT
u"\U0001F3FD" # 'Sk', 'W' -- EMOJI MODIFIER FITZPATRICK TYPE-4
) * 2
) * 2
# This test adapted from https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
expect_length_each = (2, 0, 0, 1, 0, 0, 2, 0, 2, 0) * 2
expect_length_phrase = 4
Expand All @@ -148,8 +148,8 @@ def test_longer_emoji_zwj_sequence():
def read_sequences_from_file(filename):
    """
    Read a sequence data file stored in the same directory as this module.

    Lines that start with '#' and lines that are empty after stripping are
    discarded.

    :param filename: name of the data file, relative to this module's folder.
    :returns: tuple ``(lines, sequences)``, where ``lines`` are the stripped
        data lines and ``sequences`` holds the result of
        ``make_sequence_from_line()`` for each of those lines, in order.
    """
    path = os.path.join(os.path.dirname(__file__), filename)
    # the context manager closes the file even when reading or decoding raises
    with codecs.open(path, 'r', encoding='utf-8') as fp:
        lines = [line.strip()
                 for line in fp.readlines()
                 if not line.startswith('#') and line.strip()]
    sequences = [make_sequence_from_line(line) for line in lines]
    return lines, sequences
Expand Down Expand Up @@ -184,7 +184,7 @@ def test_recommended_emoji_zwj_sequences():

def test_recommended_variation_16_sequences():
"""
Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
"""
# given,
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
Expand All @@ -210,6 +210,34 @@ def test_recommended_variation_16_sequences():
assert num >= 742


def test_recommended_variation_15_sequences():
    """
    Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt

    Every sequence containing U+FE0E (VS-15) is expected to measure a width of 1.
    """
    # given,
    lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')

    errors = []
    # NOTE: num counts every sequence read from the file, not only the VS-15
    # ones, because it is incremented before the filter below.
    num = 0
    for sequence, line in zip(sequences, lines):
        num += 1
        if u'\ufe0e' not in sequence:
            # filter for only \uFE0E (VS-15)
            continue
        measured_width = wcwidth.wcswidth(sequence)
        if measured_width != 1:
            errors.append({
                'expected_width': 1,
                'line': line,
                # reuse the value already measured rather than measuring again
                'measured_width': measured_width,
                'sequence': sequence,
            })

    # verify
    assert errors == []
    assert num >= 742


def test_unicode_9_vs16():
"""Verify effect of VS-16 on unicode_version 9.0 and later"""
phrase = (u"\u2640" # FEMALE SIGN
Expand All @@ -226,8 +254,26 @@ def test_unicode_9_vs16():
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_unicode_9_vs15():
    """Verify effect of VS-15 on unicode_version 9.0 and later"""
    phrase = (u"\U0001f4da"  # BOOKS
              u"\uFE0E")     # VARIATION SELECTOR-15

    # measured alone, BOOKS is wide (2) and the selector is zero-width (0);
    # measured together as a phrase, the pair is expected to be narrow (1).
    expect_length_each = (2, 0)
    expect_length_phrase = 1

    # exercise,
    length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
    length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')

    # verify.
    assert length_each == expect_length_each
    assert length_phrase == expect_length_phrase


def test_unicode_8_vs16():
"""Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
"""Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
phrase = (u"\u2640" # FEMALE SIGN
u"\uFE0F") # VARIATION SELECTOR-16

Expand All @@ -240,4 +286,21 @@ def test_unicode_8_vs16():

# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase
assert length_phrase == expect_length_phrase


def test_unicode_8_vs15():
    """Check that VS-15 leaves measured widths unchanged under unicode_version 8.0."""
    version = '8.0'
    books = u"\U0001f4da"  # BOOKS
    vs15 = u"\uFE0E"       # VARIATION SELECTOR-15
    sequence = books + vs15

    # exercise: measure each character on its own, then the whole sequence
    per_char_widths = []
    for char in sequence:
        per_char_widths.append(wcwidth.wcwidth(char, unicode_version=version))
    total_width = wcwidth.wcswidth(sequence, unicode_version=version)

    # verify: width 1 for BOOKS, width 0 for the selector, total of 1
    assert tuple(per_char_widths) == (1, 0)
    assert total_width == 1
5 changes: 4 additions & 1 deletion tests/test_table_integrity.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""
Executes verify-table-integrity.py as a unit test.
"""
# std imports
import os
import sys
import subprocess

# 3rd party
import pytest


@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
os.path.pardir,
'bin',
'verify-table-integrity.py')])
'verify-table-integrity.py')])
6 changes: 5 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,13 @@ basepython = python3.11
commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin

[testenv:pylint]
# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
# Short of adding '# pylint: disable=duplicate-code' to the template files, we
# can choose only to disable a specific check, or to ignore specific files. We
# ignore the files.
basepython = python3.11
commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \
{posargs:{toxinidir}}/wcwidth

[testenv:flake8]
Expand Down
14 changes: 8 additions & 6 deletions wcwidth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
"""
# re-export all functions & definitions, even private ones, from top-level
# module path, to allow for 'from wcwidth import _private_func'. Of course,
# user beware that any _private function may disappear or change signature at
# any future version.
# user beware that any _private functions or variables not exported by __all__
# may disappear or change signature at any future version.

# local
from .wcwidth import ZERO_WIDTH # noqa
from .wcwidth import (WIDE_EASTASIAN,
VS15_WIDE_TO_NARROW,
VS16_NARROW_TO_WIDE,
wcwidth,
wcswidth,
Expand All @@ -23,7 +24,8 @@
# 'from wcwidth import *', but also to say, "This is the public API".
__all__ = ('wcwidth', 'wcswidth', 'list_versions')

# We also used pkg_resources to load unicode version tables from version.json,
# generated by bin/update-tables.py, but some environments are unable to
# import pkg_resources for one reason or another, yikes!
__version__ = '0.2.13'
# We previously used pkg_resources to load unicode version tables from
# 'version.json', generated by bin/update-tables.py, but some environments are
# unable to import pkg_resources for one reason or another, so this is
# MANUALLY DUPLICATED here and in setup.py
__version__ = '0.2.14'
Loading

0 comments on commit 651f52c

Please sign in to comment.