gen-wcwidth.py

#!/usr/bin/env python3
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
import re
import subprocess
import sys
from collections import defaultdict
from contextlib import contextmanager
from functools import lru_cache, partial
from html.entities import html5
from itertools import groupby
from operator import itemgetter
from typing import (
    Callable,
    DefaultDict,
    Dict,
    FrozenSet,
    Generator,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)
from urllib.request import urlopen

os.chdir(os.path.dirname(os.path.abspath(__file__)))

non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
if len(non_characters) != 66:
    raise SystemExit('non_characters table incorrect')
emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))


def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        with open(local, 'rb') as f:
            data = f.read()
    else:
        data = urlopen(url).read()
        with open(local, 'wb') as f:
            f.write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line


@lru_cache(maxsize=2)
def unicode_version() -> Tuple[int, int, int]:
    for line in get_data("ReadMe.txt"):
        m = re.search(r'Version\s+(\d+)\.(\d+)\.(\d+)', line)
        if m is not None:
            return int(m.group(1)), int(m.group(2)), int(m.group(3))
    raise ValueError('Could not find Unicode Version')


# Map of class names to set of codepoints in class
class_maps: Dict[str, Set[int]] = {}
all_symbols: Set[int] = set()
name_map: Dict[int, str] = {}
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
soft_hyphen = 0xad
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# See https://github.com/harfbuzz/harfbuzz/issues/169
marks = set(emoji_skin_tone_modifiers) | flag_codepoints
not_assigned = set(range(0, sys.maxunicode))
property_maps: Dict[str, Set[int]] = defaultdict(set)


def parse_prop_list() -> None:
    global marks
    for line in get_data('ucd/PropList.txt'):
        if line.startswith('#'):
            continue
        cp_or_range, rest = line.split(';', 1)
        chars = parse_range_spec(cp_or_range.strip())
        name = rest.strip().split()[0]
        property_maps[name] |= chars
    # see https://www.unicode.org/faq/unsup_char.html#3
    marks |= property_maps['Other_Default_Ignorable_Code_Point']


def parse_ucd() -> None:

    def add_word(w: str, c: int) -> None:
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    first: Optional[int] = None
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        name = parts[1] or parts[10]
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[Tuple[int, ...], Iterable[int]] = (codepoint,)
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                marks.add(codepoint)
            elif category.startswith('S'):
                all_symbols.add(codepoint)
            elif category == 'Cf':
                # we add Cf to marks as it contains things like tags and zero
                # width chars. Not sure if *all* of Cf should be treated as
                # combining chars, might need to add individual exceptions in
                # the future.
                marks.add(codepoint)

    with open('nerd-fonts-glyphs.txt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)

    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']


def parse_range_spec(spec: str) -> Set[int]:
    spec = spec.strip()
    if '..' in spec:
        chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        chars = set(range(chars_[0], chars_[1] + 1))
    else:
        chars = {int(spec, 16)}
    return chars


def split_two(line: str) -> Tuple[Set[int], str]:
    spec, rest = line.split(';', 1)
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    return parse_range_spec(spec), rest


all_emoji: Set[int] = set()
emoji_presentation_bases: Set[int] = set()
narrow_emoji: Set[int] = set()
wide_emoji: Set[int] = set()
flags: Dict[int, List[int]] = {}


def parse_basic_emoji(spec: str) -> None:
    parts = list(filter(None, spec.split()))
    has_emoji_presentation = len(parts) < 2
    chars = parse_range_spec(parts[0])
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars)


def parse_keycap_sequence(spec: str) -> None:
    base, fe0f, cc = list(filter(None, spec.split()))
    chars = parse_range_spec(base)
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    narrow_emoji.update(chars)


def parse_flag_emoji_sequence(spec: str) -> None:
    a, b = list(filter(None, spec.split()))
    left, right = int(a, 16), int(b, 16)
    chars = {left, right}
    all_emoji.update(chars)
    wide_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    flags.setdefault(left, []).append(right)


def parse_emoji_tag_sequence(spec: str) -> None:
    a = int(spec.split()[0], 16)
    all_emoji.add(a)
    wide_emoji.add(a)
    emoji_presentation_bases.add(a)


def parse_emoji_modifier_sequence(spec: str) -> None:
    a, b = list(filter(None, spec.split()))
    char, mod = int(a, 16), int(b, 16)
    mod
    all_emoji.add(char)
    wide_emoji.add(char)
    emoji_presentation_bases.add(char)


def parse_emoji() -> None:
    for line in get_data('emoji-sequences.txt', 'emoji'):
        parts = [x.strip() for x in line.split(';')]
        if len(parts) < 2:
            continue
        data, etype = parts[:2]
        if etype == 'Basic_Emoji':
            parse_basic_emoji(data)
        elif etype == 'Emoji_Keycap_Sequence':
            parse_keycap_sequence(data)
        elif etype == 'RGI_Emoji_Flag_Sequence':
            parse_flag_emoji_sequence(data)
        elif etype == 'RGI_Emoji_Tag_Sequence':
            parse_emoji_tag_sequence(data)
        elif etype == 'RGI_Emoji_Modifier_Sequence':
            parse_emoji_modifier_sequence(data)


doublewidth: Set[int] = set()
ambiguous: Set[int] = set()


def parse_eaw() -> None:
    global doublewidth, ambiguous
    seen: Set[int] = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw in ('W', 'F'):
            doublewidth |= chars
            seen |= chars
    doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
    doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
    doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
    doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
    doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen


def get_ranges(items: List[int]) -> Generator[Union[int, Tuple[int, int]], None, None]:
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec: Union[Tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
    if isinstance(spec, tuple):
        if for_go:
            v = ', '.join(f'0x{x:x}' for x in range(spec[0], spec[1] + 1))
            p(f'\t\tcase {v}:')
        else:
            p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p(f'\t\tcase 0x{spec:x}:')


@contextmanager
def create_header(path: str, include_data_types: bool = True) -> Generator[Callable[..., None], None, None]:
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// Unicode data, built from the Unicode Standard', '.'.join(map(str, unicode_version())))
        p(f'// Code generated by {os.path.basename(__file__)}, DO NOT EDIT.', end='\n\n')
        if path.endswith('.h'):
            p('#pragma once')
        if include_data_types:
            p('#include "data-types.h"\n')
            p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        if include_data_types:
            p('END_ALLOW_CASE_RANGE')


def gen_emoji() -> None:
    with create_header('kitty/emoji.h') as p:
        p('static inline bool\nis_emoji(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_emoji)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false;\n}')

        p('static inline bool\nis_symbol(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_symbols)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false;\n}')


def category_test(
    name: str,
    p: Callable[..., None],
    classes: Iterable[str],
    comment: str,
    use_static: bool = False,
    extra_chars: Union[FrozenSet[int], Set[int]] = frozenset(),
    exclude: Union[Set[int], FrozenSet[int]] = frozenset(),
    least_check_return: Optional[str] = None,
    ascii_range: Optional[str] = None
) -> None:
    static = 'static inline ' if use_static else ''
    chars: Set[int] = set()
    for c in classes:
        chars |= class_maps[c]
    chars |= extra_chars
    chars -= exclude
    p(f'{static}bool\n{name}(char_type code) {{')
    p(f'\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
    if least_check_return is not None:
        least = min(chars)
        p(f'\tif (LIKELY(code < {least})) return {least_check_return};')
    if ascii_range is not None:
        p(f'\tif (LIKELY(0x20 <= code && code <= 0x7e)) return {ascii_range};')
    p('\tswitch(code) {')
    for spec in get_ranges(list(chars)):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t} // }}}\n')
    p('\treturn false;\n}\n')


def codepoint_to_mark_map(p: Callable[..., None], mark_map: List[int]) -> Dict[int, int]:
    p('\tswitch(c) { // {{{')
    rmap = {c: m for m, c in enumerate(mark_map)}
    for spec in get_ranges(mark_map):
        if isinstance(spec, tuple):
            s = rmap[spec[0]]
            cases = ' '.join(f'case {i}:' for i in range(spec[0], spec[1]+1))
            p(f'\t\t{cases} return {s} + c - {spec[0]};')
        else:
            p(f'\t\tcase {spec}: return {rmap[spec]};')
    p('default: return 0;')
    p('\t} // }}}')
    return rmap


def classes_to_regex(classes: Iterable[str], exclude: str = '') -> Iterable[str]:
    chars: Set[int] = set()
    for c in classes:
        chars |= class_maps[c]
    for x in map(ord, exclude):
        chars.discard(x)

    def as_string(codepoint: int) -> str:
        if codepoint < 256:
            return fr'\x{codepoint:02x}'
        if codepoint <= 0xffff:
            return fr'\u{codepoint:04x}'
        return fr'\U{codepoint:08x}'

    for spec in get_ranges(list(chars)):
        if isinstance(spec, tuple):
            yield '{}-{}'.format(*map(as_string, (spec[0], spec[1])))
        else:
            yield as_string(spec)


def gen_ucd() -> None:
    cz = {c for c in class_maps if c[0] in 'CZ'}
    with create_header('kitty/unicode-data.c') as p:
        p('#include "unicode-data.h"')
        category_test(
                'is_combining_char', p,
                (),
                'Combining and default ignored characters',
                extra_chars=marks,
                least_check_return='false'
        )
        category_test(
            'is_ignored_char', p, 'Cc Cs'.split(),
            'Control characters and non-characters',
            extra_chars=non_characters,
            ascii_range='false'
        )
        category_test(
            'is_non_rendered_char', p, 'Cc Cs Cf'.split(),
            'Other_Default_Ignorable_Code_Point and soft hyphen',
            extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | set(range(0xfe00, 0xfe0f + 1)),
            ascii_range='false'
        )
        category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
        category_test('is_CZ_category', p, cz, 'C and Z categories')
        category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
        mark_map = [0] + list(sorted(marks))
        p('char_type codepoint_for_mark(combining_type m) {')
        p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}')
        p('\tif (m < arraysz(map)) return map[m];')
        p('\treturn 0;')
        p('}\n')
        p('combining_type mark_for_codepoint(char_type c) {')
        rmap = codepoint_to_mark_map(p, mark_map)
        p('}\n')
        with open('kitty/unicode-data.h', 'r+') as f:
            raw = f.read()
            f.seek(0)
            raw, num = re.subn(
                r'^// START_KNOWN_MARKS.+?^// END_KNOWN_MARKS',
                '// START_KNOWN_MARKS\nstatic const combining_type '
                f'VS15 = {rmap[0xfe0e]}, VS16 = {rmap[0xfe0f]};'
                '\n// END_KNOWN_MARKS', raw, flags=re.MULTILINE | re.DOTALL)
            if not num:
                raise SystemExit('Faile dto patch mark definitions in unicode-data.h')
            f.truncate()
            f.write(raw)

    with open('kittens/hints/url_regex.py', 'w') as f:
        f.write('# generated by gen-wcwidth.py, do not edit\n\n')
        f.write("url_delimiters = '{}'  # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))


def gen_names() -> None:
    with create_header('kittens/unicode_input/names.h') as p:
        mark_to_cp = list(sorted(name_map))
        cp_to_mark = {cp: m for m, cp in enumerate(mark_to_cp)}
        # Mapping of mark to codepoint name
        p(f'static const char* name_map[{len(mark_to_cp)}] = {{' ' // {{{')
        for cp in mark_to_cp:
            w = name_map[cp].replace('"', '\\"')
            p(f'\t"{w}",')
        p("}; // }}}\n")

        # Mapping of mark to codepoint
        p(f'static const char_type mark_to_cp[{len(mark_to_cp)}] = {{' ' // {{{')
        p(', '.join(map(str, mark_to_cp)))
        p('}; // }}}\n')

        # Function to get mark number for codepoint
        p('static char_type mark_for_codepoint(char_type c) {')
        codepoint_to_mark_map(p, mark_to_cp)
        p('}\n')
        p('static inline const char* name_for_codepoint(char_type cp) {')
        p('\tchar_type m = mark_for_codepoint(cp); if (m == 0) return NULL;')
        p('\treturn name_map[m];')
        p('}\n')

        # Array of all words
        word_map = tuple(sorted(word_search_map))
        word_rmap = {w: i for i, w in enumerate(word_map)}
        p(f'static const char* all_words_map[{len(word_map)}] = {{' ' // {{{')
        cwords = (w.replace('"', '\\"') for w in word_map)
        p(', '.join(f'"{w}"' for w in cwords))
        p('}; // }}}\n')

        # Array of sets of marks for each word
        word_to_marks = {word_rmap[w]: frozenset(map(cp_to_mark.__getitem__, cps)) for w, cps in word_search_map.items()}
        all_mark_groups = frozenset(word_to_marks.values())
        array = [0]
        mg_to_offset = {}
        for mg in all_mark_groups:
            mg_to_offset[mg] = len(array)
            array.append(len(mg))
            array.extend(sorted(mg))
        p(f'static const char_type mark_groups[{len(array)}] = {{' ' // {{{')
        p(', '.join(map(str, array)))
        p('}; // }}}\n')
        offsets_array = []
        for wi, w in enumerate(word_map):
            mg = word_to_marks[wi]
            offsets_array.append(mg_to_offset[mg])
        p(f'static const char_type mark_to_offset[{len(offsets_array)}] = {{' ' // {{{')
        p(', '.join(map(str, offsets_array)))
        p('}; // }}}\n')

        # The trie
        p('typedef struct { uint32_t children_offset; uint32_t match_offset; } word_trie;\n')
        all_trie_nodes: List['TrieNode'] = []

        class TrieNode:

            def __init__(self) -> None:
                self.match_offset = 0
                self.children_offset = 0
                self.children: Dict[int, int] = {}

            def add_letter(self, letter: int) -> int:
                if letter not in self.children:
                    self.children[letter] = len(all_trie_nodes)
                    all_trie_nodes.append(TrieNode())
                return self.children[letter]

            def __str__(self) -> str:
                return f'{{ .children_offset={self.children_offset}, .match_offset={self.match_offset} }}'

        root = TrieNode()
        all_trie_nodes.append(root)

        def add_word(word_idx: int, word: str) -> None:
            parent = root
            for letter in map(ord, word):
                idx = parent.add_letter(letter)
                parent = all_trie_nodes[idx]
            parent.match_offset = offsets_array[word_idx]

        for i, word in enumerate(word_map):
            add_word(i, word)
        children_array = [0]
        for node in all_trie_nodes:
            if node.children:
                node.children_offset = len(children_array)
                children_array.append(len(node.children))
                for letter, child_offset in node.children.items():
                    children_array.append((child_offset << 8) | (letter & 0xff))

        p(f'static const word_trie all_trie_nodes[{len(all_trie_nodes)}] = {{' ' // {{{')
        p(',\n'.join(map(str, all_trie_nodes)))
        p('\n}; // }}}\n')
        p(f'static const uint32_t children_array[{len(children_array)}] = {{' ' // {{{')
        p(', '.join(map(str, children_array)))
        p('}; // }}}\n')


def gen_wcwidth() -> None:
    seen: Set[int] = set()
    non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']

    def add(p: Callable[..., None], comment: str, chars_: Union[Set[int], FrozenSet[int]], ret: int, for_go: bool = False) -> None:
        chars = chars_ - seen
        seen.update(chars)
        p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
        for spec in get_ranges(list(chars)):
            write_case(spec, p, for_go)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    def add_all(p: Callable[..., None], for_go: bool = False) -> None:
        seen.clear()
        add(p, 'Flags', flag_codepoints, 2, for_go)
        add(p, 'Marks', marks | {0}, 0, for_go)
        add(p, 'Non-printing characters', non_printing, -1, for_go)
        add(p, 'Private use', class_maps['Co'], -3, for_go)
        add(p, 'Text Presentation', narrow_emoji, 1, for_go)
        add(p, 'East Asian ambiguous width', ambiguous, -2, for_go)
        add(p, 'East Asian double width', doublewidth, 2, for_go)
        add(p, 'Emoji Presentation', wide_emoji, 2, for_go)

        add(p, 'Not assigned in the unicode character database', not_assigned, -4, for_go)

        p('\t\tdefault:\n\t\t\treturn 1;')
        p('\t}')
        if for_go:
            p('\t}')
        else:
            p('\treturn 1;\n}')

    with create_header('kitty/wcwidth-std.h') as p, open('tools/wcswidth/std.go', 'w') as gof:
        gop = partial(print, file=gof)
        gop('package wcswidth\n\n')
        gop('func Runewidth(code rune) int {')
        p('static inline int\nwcwidth_std(int32_t code) {')
        p('\tif (LIKELY(0x20 <= code && code <= 0x7e)) { return 1; }')
        p('\tswitch(code) {')
        gop('\tswitch(code) {')
        add_all(p)
        add_all(gop, True)

        p('static inline bool\nis_emoji_presentation_base(uint32_t code) {')
        gop('func IsEmojiPresentationBase(code rune) bool {')
        p('\tswitch(code) {')
        gop('\tswitch(code) {')
        for spec in get_ranges(list(emoji_presentation_bases)):
            write_case(spec, p)
            write_case(spec, gop, for_go=True)
            p('\t\t\treturn true;')
            gop('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        gop('\t\tdefault:\n\t\t\treturn false')
        gop('\t}')
        p('\treturn true;\n}')
        gop('\n}')
        uv = unicode_version()
        p(f'#define UNICODE_MAJOR_VERSION {uv[0]}')
        p(f'#define UNICODE_MINOR_VERSION {uv[1]}')
        p(f'#define UNICODE_PATCH_VERSION {uv[2]}')
        gop('var UnicodeDatabaseVersion [3]int = [3]int{' f'{uv[0]}, {uv[1]}, {uv[2]}' + '}')
    subprocess.check_call(['gofmt', '-w', '-s', gof.name])


parse_ucd()
parse_prop_list()
parse_emoji()
parse_eaw()
gen_ucd()
gen_wcwidth()
gen_emoji()
gen_names()