-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_table_zwj.py
97 lines (84 loc) · 3.46 KB
/
make_table_zwj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# This is a minified version of bin/update-tables.py from https://github.com/jquast/wcwidth/
import os
import re
import collections
# third party
import requests
import wcwidth
# URL template of the emoji ZWJ sequences data file; "{version}" is filled in
# with str.format() before the request is made.
URL_EMOJI_ZWJ_SEQUENCES = "https://unicode.org/Public/emoji/{version}/emoji-zwj-sequences.txt"

# Chunk size handed to the streaming download loop.
FETCH_BLOCKSIZE = 3096

# Parent of the folder containing this script, expressed relative to the
# current working directory.
PATH_UP = os.path.relpath(os.path.join(os.path.dirname(__file__), os.pardir))

# Folder where downloaded data files are kept.
PATH_DATA = os.path.join(PATH_UP, "data")
def do_retrieve(url: str, fname: str) -> None:
    """Retrieve given url to target filepath fname.

    The parent folder of ``fname`` is created when missing.  When ``fname``
    already exists it is treated as a cached copy and nothing is downloaded.

    :param url: address to download.
    :param fname: local path the content is stored at.
    :raises requests.HTTPError: when the server answers with an error status.
    """
    folder = os.path.dirname(fname)
    if folder:
        os.makedirs(folder, exist_ok=True)
    if os.path.exists(fname):
        return

    # Stream into a sibling ".part" file and rename it only after the whole
    # body has been written.  Because an existing fname short-circuits every
    # later call, a truncated or error-page download must never be left at
    # the final path.
    partial_fname = fname + ".part"
    with requests.get(url, stream=True, timeout=30) as resp:
        # Without this check an HTML error page would be cached as data.
        resp.raise_for_status()
        with open(partial_fname, "wb") as fout:
            for chunk in resp.iter_content(FETCH_BLOCKSIZE):
                fout.write(chunk)
    os.replace(partial_fname, fname)
def fetch_zwj_data():
    """Determine Unicode Versions with Emoji Zero Width Join character support.

    Retrieves the "latest" emoji-zwj-sequences.txt into PATH_DATA (an already
    downloaded copy is reused) and groups its sequences by the version found
    after the ``# E`` marker in each line's trailing comment.

    :returns: dict mapping each version string to a list of sequences, every
        sequence being a tuple of integer code points.  Keys are inserted
        from the highest version to the lowest.
    """
    # From Unicode® Technical Standard #51
    #
    # > Starting with Version 11.0 of this specification, the repertoire of
    # > emoji characters is synchronized with the Unicode Standard, and has the
    # > same version numbering system. For details, see Section 1.5.2, Versioning.
    #
    # http://www.unicode.org/reports/tr51/#Versioning
    # http://www.unicode.org/reports/tr51/#EmojiVersions
    fname = os.path.join(PATH_DATA, URL_EMOJI_ZWJ_SEQUENCES.rsplit("/", 1)[-1])
    filename, ext = os.path.splitext(fname)
    fname = filename + "-latest" + ext
    do_retrieve(url=URL_EMOJI_ZWJ_SEQUENCES.format(version="latest"), fname=fname)

    def version_sort_key(version_str):
        # Compare versions numerically, component by component, so that
        # "10.0" orders above "9.0"; the string itself breaks exact ties.
        # Computed locally rather than through a private wcwidth helper.
        return tuple(int(part) for part in version_str.split(".")), version_str

    pattern = re.compile(r".*# E([0-9.]+)")
    versions = set()
    result = collections.defaultdict(list)
    with open(fname, encoding="utf-8") as f:
        for line in f:
            match = pattern.match(line)
            if match is None:
                # No version marker: nothing to attribute this line to.
                continue
            version = match.group(1)
            versions.add(version)
            data, _, _ = line.partition("#")
            # Only the first ';'-separated field (the code points) is needed.
            code_points_str = data.split(";", 1)[0].strip()
            if code_points_str:
                result[version].append(
                    tuple(int(code_point, 16) for code_point in code_points_str.split())
                )

    sorted_versions = sorted(versions, key=version_sort_key, reverse=True)
    return {vv_str: result[vv_str] for vv_str in sorted_versions}
def parse_zwj(fname: str, version: str):
    """Parse an emoji ZWJ sequences file into tuples of integer code points.

    On every line, anything from the first ``#`` onward is discarded.  The
    first ``;``-separated field of the remainder is read as whitespace
    separated hexadecimal code points.  Lines whose first field is empty,
    such as blank or comment-only lines, are skipped.

    :param fname: path of the UTF-8 encoded file to read.
    :param version: not used by the parser; kept so the signature is unchanged
        for existing callers.
    :returns: list of code point tuples, in file order.
    :raises ValueError: when a code point is not valid hexadecimal.
    """
    sequences = []
    with open(fname, encoding="utf-8") as fin:
        for line in fin:
            data, _, _ = line.partition("#")
            # Only the leading field matters; the remaining fields are ignored.
            code_points_str = data.split(";", 1)[0].strip()
            if code_points_str:
                sequences.append(
                    tuple(int(code_point, 16) for code_point in code_points_str.split())
                )
    return sequences
def main():
    """Print the fetched ZWJ sequences to stdout as a Python tuple literal."""
    # plain string formatting is used here instead of a jinja template
    print("EMOJI_ZWJ_SEQUENCES = (")
    table = fetch_zwj_data()
    for version in table:
        entry_lines = [f" ('{version}', ("]
        entry_lines.extend(f" {zwj_sequence}," for zwj_sequence in table[version])
        # first line closes the tuple of sequences, second closes the entry
        entry_lines.append(" ),")
        entry_lines.append(" ),")
        print("\n".join(entry_lines))
    print(")")
# Script entry point. The generated table is written to stdout, so redirect
# it into the target module:
# $ python make_table_zwj.py > ucs_detect/table_zwj.py
if __name__ == "__main__":
    main()