Skip to content

Commit

Permalink
Bugfixes for zero-width characters (#91)
Browse files Browse the repository at this point in the history
Major
-----

Bugfix zero-width characters, closes #57, #47, #45, #39, #26, #25, #24, #22, #8, wow!

This is mostly achieved by replacing `ZERO_WIDTH_CF` with dynamic parsing by Category codes in bin/update-tables.py and putting those in the zero-width tables.

Tests
-----

- `verify-table-integrity.py` exercises a "bug" of duplicated tables that has no effect, because wcswidth() first checks for zero-width, and that is preferred in cases of conflict. This PR also resolves that error of duplication.
- new automatic tests for balinese, kr jamo, zero-width emoji, devanagari, tamil, kannada.  
- added pytest-benchmark plugin, example use:

        # baseline
        tox -epy312 -- --verbose --benchmark-save=original
        # compare
        tox -epy312 -- --verbose --benchmark-compare=.benchmarks/Linux-CPython-3.12-64bit/0001_original.json
  • Loading branch information
jquast committed Oct 30, 2023
1 parent 4f41d0c commit 04d6d90
Show file tree
Hide file tree
Showing 18 changed files with 2,735 additions and 2,512 deletions.
202 changes: 123 additions & 79 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
JINJA_ENV = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.join(PATH_UP, 'code_templates')),
    keep_trailing_newline=True)
# Timezone-aware "now"; datetime.utcnow() is deprecated since Python 3.12.
# datetime.timezone.utc is identical to datetime.UTC but available on all
# supported versions (datetime.UTC was only added in 3.11).
UTC_NOW = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

# network and fetch tunables, overridable by environment variable
CONNECT_TIMEOUT = int(os.environ.get('CONNECT_TIMEOUT', '10'))
FETCH_BLOCKSIZE = int(os.environ.get('FETCH_BLOCKSIZE', '4096'))
Expand All @@ -72,7 +72,7 @@ class UnicodeVersion:
@classmethod
def parse(cls, version_str: str) -> UnicodeVersion:
"""
parse a version string.
Parse a version string.
>>> UnicodeVersion.parse("14.0.0")
UnicodeVersion(major=14, minor=0, micro=0)
Expand All @@ -90,21 +90,99 @@ def __str__(self) -> str:
@dataclass(frozen=True)
class TableEntry:
"""An entry of a unicode table."""
code_range: range | None
code_range: tuple[int, int] | None
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
    """
    Return whether entry matches the given displayed width.

    Categories are described here,
    https://www.unicode.org/reports/tr44/#GC_Values_Table

    NOTE(review): parameter ``category_codes`` is currently unused; the
    decision is made entirely from ``self.properties[0]`` — confirm intent.
    """
    # Entries without a codepoint range (comment-only rows) never match.
    if self.code_range is None:
        return False
    category = self.properties[0]
    if category == 'Sk':
        if 'EMOJI MODIFIER' in self.comment:
            # These codepoints are fullwidth when used without emoji,
            # 0-width with.  Generate code that expects the best case,
            # that is always combined.
            expected_width = 0
        elif 'FULLWIDTH' in self.comment:
            # Some codepoints in 'Sk' categories are fullwidth(!) -- at this
            # time just 3: FULLWIDTH CIRCUMFLEX ACCENT, GRAVE ACCENT, MACRON.
            expected_width = 2
        else:
            # the rest of 'Sk' are narrow
            expected_width = 1
    elif category in ('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp'):
        # Enclosing Mark, Nonspacing Mark, Spacing Mark, Format,
        # Line Separator, Paragraph Separator: zero-width.
        expected_width = 0
    elif category in ('W', 'F'):
        # Wide and Fullwidth east-asian categories: displayed width 2.
        expected_width = 2
    else:
        expected_width = 1
    return wide == expected_width

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


@dataclass
class TableDef:
filename: str
date: str
values: list[tuple[str, str, str]]
values: set[int]

def as_value_ranges(self) -> list[tuple[int, int]]:
    """
    Return a list of tuple of (start, end) ranges for given set of 'values'.

    Ranges are inclusive at both ends; adjacent values are coalesced.
    """
    table: list[tuple[int, int]] = []
    values_iter = iter(sorted(self.values))
    # bugfix: an empty 'values' set previously raised StopIteration here;
    # return an empty table instead.
    first = next(values_iter, None)
    if first is None:
        return table
    table.append((first, first))

    for value in values_iter:
        # remove last-most entry for comparison,
        start, end = table.pop()
        if end == value - 1:
            # continuation of existing range, rewrite
            table.append((start, value,))
        else:
            # non-continuation: insert back previous range,
            table.append((start, end,))
            # and start a new one
            table.append((value, value,))
    return table

@property
def hex_range_descriptions(self) -> list[tuple[str, str, str]]:
    """Convert integers into string table of (hex_start, hex_end, txt_description)."""
    described: list[tuple[str, str, str]] = []
    for start, end in self.as_value_ranges():
        # '(nil)' substitutes for codepoints without a unicode name
        first_name = name_ucs(chr(start)) or '(nil)'
        last_name = name_ucs(chr(end)) or '(nil)'
        if first_name == last_name:
            description = first_name[:48]
        else:
            # pad first name to fixed width, join by '..'
            description = f'{first_name[:24].rstrip():24s}..{last_name[:24].rstrip()}'
        described.append((f'0x{start:05x}', f'0x{end:05x}', description))
    return described


@dataclass(frozen=True)
class RenderContext:

def to_dict(self) -> dict[str, Any]:
    """Return a mapping of this dataclass' field names to their values."""
    # NOTE(review): presumably consumed as keyword arguments for a jinja2
    # template render context -- confirm against subclasses.
    return {field.name: getattr(self, field.name)
            for field in fields(self)}
Expand Down Expand Up @@ -145,11 +223,11 @@ def __post_init__(self) -> None:
}

def render(self) -> str:
    """Just like jinja2.Template.render."""
    return self._template.render(self._render_context)

def generate(self) -> Iterator[str]:
    """Just like jinja2.Template.generate."""
    return self._template.generate(self._render_context)


Expand Down Expand Up @@ -248,22 +326,38 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
"""Fetch and update east-asian tables."""
table: dict[UnicodeVersion, TableDef] = {}
for version in fetch_unicode_versions():
fname = os.path.join(PATH_DATA, f'EastAsianWidth-{version}.txt')
do_retrieve(url=URL_EASTASIAN_WIDTH.format(version=version), fname=fname)
table[version] = parse_category(fname=fname, category_codes=('W', 'F',))
# parse typical 'wide' characters by categories 'W' and 'F',
fname_eaw = os.path.join(PATH_DATA, f'EastAsianWidth-{version}.txt')
do_retrieve(url=URL_EASTASIAN_WIDTH.format(version=version), fname=fname_eaw)
table[version] = parse_category(fname=fname_eaw, category_codes=('W', 'F'), wide=2)

# subtract(!) wide characters that are defined as 'W' category in EAW, but
# as a zero-width category 'Mn' or 'Mc' in DGC, which is preferred.
fname_dgc = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt')
do_retrieve(url=URL_UNICODE_DERIVED_AGE.format(version=version), fname=fname_dgc)
table[version].values.discard(parse_category(fname=fname_dgc, category_codes=('Mn', 'Mc'), wide=0).values)

# join with some atypical 'wide' characters defined only by category
# 'Sk' in DGC
table[version].values.update(parse_category(fname=fname_dgc, category_codes=('Sk',), wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)


def fetch_table_zero_data() -> UnicodeTableRenderCtx:
    """
    Fetch and update zero width tables.

    See also: https://unicode.org/L2/L2002/02368-default-ignorable.html
    """
    table: dict[UnicodeVersion, TableDef] = {}
    for version in fetch_unicode_versions():
        # Determine values of zero-width character lookup table by the
        # following category codes
        fname_dgc = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt')
        do_retrieve(url=URL_DERIVED_CATEGORY.format(version=version), fname=fname_dgc)
        table[version] = parse_category(
            fname=fname_dgc,
            category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
            wide=0)

        # And, include NULL
        table[version].values.add(0)
    return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand All @@ -277,54 +371,11 @@ def cite_source_description(filename: str) -> tuple[str, str]:
return fname, date


def make_table(values: Collection[int]) -> tuple[tuple[int, int], ...]:
    """
    Return a tuple of lookup tables for given values.

    'values' is expected in sorted order; adjacent integers are coalesced
    into inclusive (start, end) range pairs.

    >>> make_table([0,1,2,5,6,7,9])
    ((0, 2), (5, 7), (9, 9))
    """
    table: list[tuple[int, int]] = []
    values_iter = iter(values)
    # bugfix: an empty 'values' collection previously raised StopIteration;
    # return an empty tuple instead.
    sentinel = object()
    first = next(values_iter, sentinel)
    if first is sentinel:
        return ()
    table.append((first, first))

    for value in values_iter:
        start, end = table.pop()
        if end == value - 1:
            # continuation of existing range
            table.append((start, value,))
        else:
            # put back existing range,
            table.append((start, end,))
            # and start a new one
            table.append((value, value,))
    return tuple(table)


def convert_values_to_string_table(
    values: Collection[tuple[int, int]],
) -> list[tuple[str, str, str]]:
    """Convert integers into string table of (hex_start, hex_end, txt_description)."""
    def _codepoint_name(codepoint: int) -> str:
        # unicodedata.name() raises ValueError for unnamed codepoints;
        # substitute the placeholder '(nil)'.
        try:
            return string.capwords(unicodedata.name(chr(codepoint)))
        except ValueError:
            return '(nil)'

    result: list[tuple[str, str, str]] = []
    for start, end in values:
        name_start = _codepoint_name(start)
        name_end = _codepoint_name(end)
        if name_start == name_end:
            txt_description = name_start[:48]
        else:
            # pad first name to fixed width, join by '..'
            txt_description = f'{name_start[:24].rstrip():24s}..{name_end[:24].rstrip()}'
        result.append((f'0x{start:05x}', f'0x{end:05x}', txt_description))
    return result
def name_ucs(ucs: str) -> str | None:
    """
    Return capitalized unicode name of given character, or None when unnamed.

    Annotation bugfix: the return type was declared ``str`` although ``None``
    is returned for codepoints without a name.
    """
    try:
        return string.capwords(unicodedata.name(ucs))
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed codepoints,
        # such as most control characters.
        return None


def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
Expand All @@ -346,13 +397,12 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
start, end = code_points_str.split('..')
else:
start = end = code_points_str
code_range = range(int(start, base=16),
int(end, base=16) + 1)
code_range = (int(start, base=16), int(end, base=16) + 1)

yield TableEntry(code_range, tuple(properties), comment)


def parse_category(fname: str, category_codes: Container[str]) -> TableDef:
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname}: ', end='', flush=True)

Expand All @@ -363,16 +413,9 @@ def parse_category(fname: str, category_codes: Container[str]) -> TableDef:
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()

values: set[int] = set()
for entry in table_iter:
if (entry.code_range is not None
and entry.properties[0] in category_codes):
values.update(entry.code_range)

txt_values = convert_values_to_string_table(make_table(sorted(values)))
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
print('ok')
return TableDef(version, date, txt_values)
return TableDef(version, date, values)


@functools.cache
Expand Down Expand Up @@ -401,7 +444,7 @@ def is_url_newer(url: str, fname: str) -> bool:
def do_retrieve(url: str, fname: str) -> None:
"""Retrieve given url to target filepath fname."""
folder = os.path.dirname(fname)
if not os.path.exists(folder):
if folder and not os.path.exists(folder):
os.makedirs(folder)
if not is_url_newer(url, fname):
return
Expand Down Expand Up @@ -431,9 +474,9 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
yield UnicodeVersionPyRenderDef.new(
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
)
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

for render_def in get_codegen_definitions():
with open(render_def.output_filename, 'w', encoding='utf-8', newline='\n') as fout:
Expand All @@ -445,3 +488,4 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:

if __name__ == '__main__':
main()

4 changes: 2 additions & 2 deletions bin/wcwidth-browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,12 @@ class Style(object):
# Too few public methods (0/2)
@staticmethod
def attr_major(text):
    """Non-stylized callable for "major" text, for non-ttys."""
    return text

@staticmethod
def attr_minor(text):
    """Non-stylized callable for "minor" text, for non-ttys."""
    return text

delimiter = '|'
Expand Down
2 changes: 1 addition & 1 deletion code_templates/python_table.py.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This code generated by {{this_filepath}} on {{utc_now}}.
# Source: {{ table_def.filename }}
# Date: {{ table_def.date }}
#
{%- for hex_start, hex_end, txt_description in table_def.values %}
{%- for hex_start, hex_end, txt_description in table_def.hex_range_descriptions %}
({{ hex_start }}, {{ hex_end }},), # {{txt_description}}
{%- endfor %}
),
Expand Down
9 changes: 4 additions & 5 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
Public API
==========

This package follows SEMVER_ rules for version, therefore, for all of the
given functions signatures, at example version 1.1.1, you may use version
dependency ``>=1.1.1,<2.0`` for forward compatibility of future wcwidth
versions.
This package follows SEMVER_ rules. Therefore, for the functions of the below
list, you may safely use version dependency definition ``wcwidth<2`` in your
requirements.txt or equivalent. Their signatures will never change.

.. autofunction:: wcwidth.wcwidth

Expand All @@ -22,7 +21,7 @@ Private API
These functions should only be used for wcwidth development, and not used by
dependent packages except with care and by use of frozen version dependency,
as these functions may change names, signatures, or disappear entirely at any
time in the future, and not reflected by SEMVER rules.
time in the future, and not reflected by SEMVER_ rules!

If stable public API for any of the given functions is needed, please suggest a
Pull Request!
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']
# html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ wcwidth

intro
unicode_version
specs
api

Indices and tables
Expand Down
6 changes: 6 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ Other Languages
=======
History
=======
0.2.9 *2023-10-20*
* **Bugfix** zero-width characters used in Emoji ZWJ sequences, Balinese,
Jamo, Devanagari, Tamil, Kannada and others (`PR #91`).
* **Updated** to include `Specification <Specification_from_pypi>`_ of
character measurements.

0.2.8 *2023-09-30*
* Include requirements files in the source distribution (`PR #82`).
Expand Down Expand Up @@ -296,6 +301,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
* for any purpose and without fee is hereby granted. The author
* disclaims all warranties with regard to this software.

.. _`Specification_from_pypi`: https://wcwidth.readthedocs.io/en/latest/specs.html
.. _`tox`: https://tox.wiki/en/latest/
.. _`prospector`: https://github.com/landscapeio/prospector
.. _`combining`: https://en.wikipedia.org/wiki/Combining_character
Expand Down
Loading

0 comments on commit 04d6d90

Please sign in to comment.