Skip to content

Commit

Permalink
Support Taiwanese phrases
Browse files Browse the repository at this point in the history
  • Loading branch information
ayaka14732 committed Sep 30, 2020
1 parent 2f29bb6 commit f0f2f9b
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 40 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Build the font files on every push to main and publish them as an artifact.
name: Build

on:
  push:
    branches: [ main ]

jobs:
  build:
    # otfcc is installed through Homebrew below, hence the macOS runner.
    runs-on: macos-latest
    steps:
    - uses: actions/checkout@v2
    - name: Install otfcc
      run: |
        brew tap caryll/tap
        brew install otfcc-mac64
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
        # Quoted so the version is always read as a string, never as a float.
        python-version: '3.8'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install -r requirements.txt
    - name: Build
      run: |
        python build/main.py
    - name: Upload artifact
      uses: actions/upload-artifact@v2
      with:
        name: Font files
        path: output/*.ttf
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Fan Wun Ming 繁媛明朝
# Fan Wun Ming 繁媛明朝 [![](https://github.com/ayaka14732/FanWunMing/workflows/Build/badge.svg)](https://github.com/ayaka14732/FanWunMing/actions?query=workflow%3ABuild)

![](demo.png)

Expand All @@ -17,8 +17,8 @@ See [release page](https://github.com/ayaka14732/FanWunMing/releases).<br/>

## Build 構建

Install Python and [otfcc](https://github.com/caryll/otfcc). Then run `python build/main.py`.<br/>
安裝 Python 與 [otfcc](https://github.com/caryll/otfcc),然後執行 `python build/main.py`
See [build script](.github/workflows/build.yml).<br/>
參見[建置腳本](.github/workflows/build.yml)

## License 授權條款

Expand Down
105 changes: 68 additions & 37 deletions build/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
from glob import glob
from itertools import chain
import json
from opencc import OpenCC
import os
import subprocess

FONT_VERSION = 1.001
FONT_VERSION = 1.002

# Define the max entries size in a subtable.
# We define a number that is small enough here, so that the entries will not exceed
# the size limit.
SUBTABLE_MAX_COUNT = 5000
SUBTABLE_MAX_COUNT = 4000

# This function is used to split a GSUB table into several subtables.
def grouper(lst, n, start=0):
Expand All @@ -27,11 +28,16 @@ def grouper(lst, n, start=0):
def prepare_files():
    '''Download necessary files for the next steps.

    Creates the ``output`` directory, fetches the source font archive, the
    OpenCC dictionaries, the ttc2ttf script and the Tongyong Guifan Hanzi Biao
    character list into ``cache``, concatenates the three Taiwanese phrase
    dictionaries into ``cache/TWPhrases.txt`` and unpacks the font archive.

    Every step is a best-effort shell command: exit statuses are not checked.
    '''
    os.system('mkdir -p output')

    opencc_data = 'https://cdn.jsdelivr.net/npm/opencc-data@1.0.3/data/'
    urls = (
        'https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip',
        opencc_data + 'STCharacters.txt',
        opencc_data + 'STPhrases.txt',
        opencc_data + 'TWPhrasesIT.txt',
        opencc_data + 'TWPhrasesName.txt',
        opencc_data + 'TWPhrasesOther.txt',
        opencc_data + 'TWVariants.txt',
        'https://gist.githubusercontent.com/fatum12/941a10f31ac1ad48ccbc/raw/59d7e29b307ae3439317a975ef390cd729f9bc17/ttc2ttf.pe',
        'https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/通用規範漢字表.txt',
    )
    for url in urls:
        # -q: quiet, -nc: keep files that are already present, -P: target directory
        os.system('wget -q -nc -P cache ' + url)

    # The three Taiwanese phrase dictionaries are used as one merged dictionary.
    os.system('cat cache/TWPhrasesIT.txt cache/TWPhrasesName.txt cache/TWPhrasesOther.txt > cache/TWPhrases.txt')
    os.system('unzip -n -d cache cache/GenYoMin.zip')

# An opentype font can hold at most 65535 glyphs.
Expand Down Expand Up @@ -100,17 +106,19 @@ def build_codepoints_non_han():
# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary
# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are
# out of range. The remaining conversions are stored in the entries variable.
#
# Then we calculate the range of “Which Traditional Chinese characters are needed if we
# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the
# codepoints variable.
def build_opencc_char_table(codepoints_tonggui, codepoints_font):
def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False):
entries = []
codepoints = set()

with open('cache/STCharacters.txt') as f:
with open('cache/STCharacters.txt') as f: # s2t
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
v = t2twp(v) if twp else v # s2t -> s2twp
codepoint_k = ord(k)
codepoint_v = ord(v)
if codepoint_k in codepoints_tonggui and codepoint_v in codepoints_font:
Expand All @@ -119,22 +127,37 @@ def build_opencc_char_table(codepoints_tonggui, codepoints_font):

return entries, codepoints

def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False):
    '''Build the phrase-level conversion table from the OpenCC dictionaries.

    Each dictionary line has the form ``key<TAB>candidate1 candidate2 ...``;
    only the first candidate is used. A conversion is kept only if every
    codepoint of its key is in ``codepoints_tonggui`` and every codepoint of
    its value is in ``codepoints_font``.

    When ``twp`` is true, the values of ``STPhrases.txt`` are additionally
    passed through ``t2twp``, and the entries of ``TWPhrases.txt`` are added
    with their keys passed through ``t2s``. Entries read later replace earlier
    entries that have the same key.

    Returns a tuple ``(entries, codepoints)``: ``entries`` is a list of
    ``(key_codepoints, value_codepoints)`` tuple pairs ordered from the
    longest key to the shortest, and ``codepoints`` is the set of all value
    codepoints that passed the filter.
    '''
    entries = {}
    codepoints = set()

    def collect(path, convert_key=None, convert_value=None):
        # The OpenCC dictionaries are UTF-8 text; do not depend on the locale.
        with open(path, encoding='utf-8') as f:
            for line in f:
                k, vx = line.rstrip('\n').split('\t')
                v = vx.split(' ')[0]  # Only select the first candidate
                if convert_key is not None:
                    k = convert_key(k)
                if convert_value is not None:
                    v = convert_value(v)
                codepoints_k = tuple(ord(c) for c in k)
                codepoints_v = tuple(ord(c) for c in v)
                if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \
                        and all(codepoint in codepoints_font for codepoint in codepoints_v):
                    entries[codepoints_k] = codepoints_v
                    codepoints.update(codepoints_v)

    # s2t, turned into s2twp by converting the values when twp is set
    collect('cache/STPhrases.txt', convert_value=t2twp if twp else None)

    if twp:
        # t2twp, turned into s2twp by converting the keys
        collect('cache/TWPhrases.txt', convert_key=t2s)

    # Sort from longest to shortest to force longest match
    return sorted(entries.items(), key=lambda k_v: (-len(k_v[0]), k_v[0])), codepoints

def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name):
'''
Expand Down Expand Up @@ -293,49 +316,55 @@ def create_pseu2word_table(obj, feature_name, conversions):
}
obj['GSUB']['lookupOrder'].append('pseu2word')

def build_fanwunming_name_header(style, version, date, twp=False):
    '''Build the font name records from the template in build/name.json.

    The placeholders ``<Style>``, ``<Version>`` and ``<Date>`` in every
    ``nameString`` are replaced by the given strings. When ``twp`` is true,
    the family names are additionally given a TW suffix.

    Returns the list of name records loaded from the template, with the
    substitutions applied.
    '''
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open('build/name.json', encoding='utf-8') as f:
        name_header = json.load(f)

    for item in name_header:
        item['nameString'] = item['nameString'] \
            .replace('<Style>', style) \
            .replace('<Version>', version) \
            .replace('<Date>', date)

        if twp:
            item['nameString'] = item['nameString'] \
                .replace('繁媛明朝', '繁媛明朝 TW') \
                .replace('Fan Wun Ming', 'Fan Wun Ming TW') \
                .replace('FanWunMing', 'FanWunMing-TW')

    return name_header

def modify_metadata(obj, twp=False):
    '''Replace the name records and the revision number of the font in place.

    The style is taken from the existing name record with nameID 17, and the
    build date is today's date formatted like ``Sep 30, 2020``. Both are
    substituted into the name template together with ``FONT_VERSION``; ``twp``
    is forwarded to ``build_fanwunming_name_header``.

    Raises StopIteration if ``obj['name']`` has no record with nameID 17.
    '''
    # Read the style before obj['name'] is overwritten below.
    style = next(item['nameString'] for item in obj['name'] if item['nameID'] == 17)
    today = date.today().strftime('%b %d, %Y')

    name_header = build_fanwunming_name_header(style, str(FONT_VERSION), today, twp=twp)

    obj['head']['fontRevision'] = FONT_VERSION
    obj['name'] = name_header

def build_dest_path_from_src_path(path, twp=False):
    '''
    Map the path of a source font in cache/ to the path of the built font in
    output/. With twp set, the family name gets a -TW suffix.

    Only a trailing .ttc extension is changed to .ttf; a "ttc" that occurs
    elsewhere in the path is left alone.

    >>> build_dest_path_from_src_path('cache/GenYoMin-R.ttc')
    'output/FanWunMing-R.ttf'
    >>> build_dest_path_from_src_path('cache/GenYoMin-R.ttc', twp=True)
    'output/FanWunMing-TW-R.ttf'
    '''
    family = 'FanWunMing' + ('-TW' if twp else '')
    dest = path \
        .replace('cache/', 'output/') \
        .replace('GenYoMin', family)

    root, ext = os.path.splitext(dest)
    return (root + '.ttf') if ext == '.ttc' else dest

def go(path):
def go(path, twp=False):
font = load_font(path, ttc_index=0)

codepoints_font = build_codepoints_font(font)
codepoints_tonggui = build_codepoints_tonggui() & codepoints_font

codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font

entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font)
entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_char

entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font)
entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_word

remove_codepoints(font, codepoints_font - codepoints_final)
Expand Down Expand Up @@ -367,13 +396,15 @@ def go(path):
create_char2char_table(font, feature_name, char2char_table)
create_pseu2word_table(font, feature_name, pseu2word_table)

modify_metadata(font)
save_font(font, build_dest_path_from_src_path(path))
modify_metadata(font, twp=twp)
save_font(font, build_dest_path_from_src_path(path, twp=twp))

# The downloads must happen before the converters are created: the custom
# t2twp configuration reads dictionaries from the cache directory.
prepare_files()

# Initialize OpenCC converters
t2s = OpenCC('t2s').convert
t2twp = OpenCC('./build/t2twp').convert

if __name__ == '__main__':
    # Build two fonts from every source font: the plain one and the TW one.
    for path in glob('cache/GenYoMin-*.ttc'):
        go(path)
        go(path, twp=True)
22 changes: 22 additions & 0 deletions build/t2twp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "Traditional Chinese to Traditional Chinese (Taiwan standard, with phrases)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "text",
"file": "../cache/TWPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "text",
"file": "../cache/TWPhrases.txt"
}, {
"type": "text",
"file": "../cache/TWVariants.txt"
}]
}
}]
}
Binary file modified demo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
opencc<1.2

0 comments on commit f0f2f9b

Please sign in to comment.