diff --git a/Jenkinsfile b/Jenkinsfile index d83e55840..1219aae54 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,9 +1,9 @@ pipeline { agent { - docker { - image 'tnitn_ci_py310:24.07' - args '-v /mnt/jenkins/jenkinsci/TestData:/home/jenkins/TestData -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' - } + docker { + image 'tnitn_ci_py310:24.07' + args '-v /mnt/jenkins/jenkinsci/TestData:/home/jenkins/TestData -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' + } } options { timeout(time: 2, unit: 'HOURS') @@ -28,11 +28,11 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/02-18-26-0' + KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-03-25-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { - stage('PyTorch version') { steps { sh 'python -c "import torch; print(torch.__version__)"' @@ -46,7 +46,6 @@ pipeline { } } - stage('L0: Create EN TN/ITN Grammars') { when { anyOf { @@ -54,7 +53,6 @@ pipeline { branch 'staging/**' branch 'staging_*' changeRequest target: 'main' - } } failFast true @@ -79,35 +77,10 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}' } } - - } - } - stage('L0: Create HI TN/ITN Grammars') { - when { - anyOf { - branch 'main' - branch 'staging/**' - branch 'staging_*' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L0: Hi TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' - } - } - stage('L0: Hi ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir 
${HI_TN_CACHE}' - } - } - } } - stage('L0: Create DE/ES TN/ITN Grammars') { + stage('L0: Create DE/ES/FR TN/ITN Grammars') { when { anyOf { branch 'main' @@ -143,38 +116,24 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es_en --text="ciento uno " --cache_dir ${ES_EN_TN_CACHE}' } } - } - } - - stage('L0: Create AR TN/ITN Grammars') { - when { - anyOf { - branch 'main' - branch 'staging/**' - branch 'staging_*' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L0: AR TN grammars') { + stage('L0: FR TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ar --text="2" --cache_dir ${AR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=fr --text="2" --cache_dir ${FR_TN_CACHE}' } } - stage('L0: AR ITN grammars') { + stage('L0: FR ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=fr --text="cent " --cache_dir ${FR_TN_CACHE}' } } - } } - stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') { + + stage('L0: Create HI/VI/RU TN/ITN') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -182,43 +141,43 @@ pipeline { } failFast true parallel { - stage('L0: FR TN grammars') { + stage('L0: VI ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=fr --text="2" --cache_dir ${FR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' } } - stage('L0: FR ITN 
grammars') { + stage('L0: VI TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=fr --text="cent " --cache_dir ${FR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' } } - stage('L0: VI ITN grammars') { + stage('L0: RU TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --lang=ru --text="03" --cache_dir ${RU_TN_CACHE}' } } - stage('L0: VI TN grammars') { + stage('L0: RU ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ru --text="три " --cache_dir ${RU_TN_CACHE}' } } - stage('L0: HU TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' + stage('L0: Hi TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' } } - stage('L0: IT TN grammars') { + stage('L0: Hi ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=it --text="122" --cache_dir ${IT_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}' } } } } - stage('L0: Create RU TN/ITN Grammars & SV & PT') { + stage('L0: Create AR/HU/SV/PT/IT TN/ITN Grammars') { when { anyOf { - 
branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -226,60 +185,53 @@ pipeline { } failFast true parallel { - stage('L0: RU TN grammars') { + stage('L0: SV TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --lang=ru --text="03" --cache_dir ${RU_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}' } } - stage('L0: RU ITN grammars') { + stage('L0: HU TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ru --text="три " --cache_dir ${RU_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' } } - stage('L0: SV TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}' + stage('L0: AR TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ar --text="2" --cache_dir ${AR_TN_CACHE}' } } - // stage('L0: SV ITN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}' - // } - // } - // stage('L0: PT TN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' - // } - // } + stage('L0: AR ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}' + } + } + // stage('L0: SV ITN grammars') { + // steps { + // sh 'CUDA_VISIBLE_DEVICES="" 
python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}' + // } + // } + // stage('L0: PT TN grammars') { + // steps { + // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' + // } + // } stage('L0: PT ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } - } - } - stage('L0: Create He TN/ITN Grammars & MR') { - when { - anyOf { - branch 'main' - branch 'staging/**' - branch 'staging_*' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L0: HE ITN grammars') { + stage('L0: IT TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=it --text="122" --cache_dir ${IT_TN_CACHE}' } } } } - stage('L0: Create HY TN/ITN Grammars & MR') { + + stage('L0: Create MR/HE/HY TN/ITN Grammars') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -302,12 +254,18 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց " --cache_dir ${HY_TN_CACHE}' } } + stage('L0: HE ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' + } + } } } - stage('L0: Create ZH TN/ITN Grammar') { + + stage('L0: Create CJK TN/ITN Grammar') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -325,34 +283,30 @@ pipeline { sh 
'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="6" --cache_dir ${ZH_TN_CACHE}' } } - } - } - stage('L0: Create JA ITN Grammars') { - when { - anyOf { - branch 'main' - branch 'staging/**' - branch 'staging_*' - changeRequest target: 'main' - } - } - failFast true - parallel { stage('L0: JA ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ja --text="100" --cache_dir ${JA_TN_CACHE}' } } + stage('L0: KO TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' + } + } + stage('L0: KO ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ko --text="백" --cache_dir ${KO_TN_CACHE}' + } + } } } - -// L1 Tests starts here + // L1 Tests starts here stage('L1: TN/ITN Tests CPU') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -424,7 +378,7 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ja/ -m "not pleasefixme" --cpu --tn_cache_dir ${JA_TN_CACHE}' } - } + } stage('L1: Run all MR ITN tests (restore grammars from cache)') { steps { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/mr/ -m "not pleasefixme" --cpu --tn_cache_dir ${MR_TN_CACHE}' @@ -440,13 +394,18 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/he/ -m "not pleasefixme" --cpu --tn_cache_dir ${HE_TN_CACHE}' } } + stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}' + } + } } } - stage('L2: EN Sparrowhawk Tests') { + stage('L2: EN Sparrowhawk Tests') { when { anyOf { - branch 'main' + branch 
'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -458,14 +417,12 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN --LANGUAGE="en"' sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased `pwd`' - } } stage('L2: EN ITN Run Sparrowhawk test - Cased Input') { steps { sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN_cased --LANGUAGE="en"' sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization_cased.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased `pwd`' - } } stage('L2: EN TN Run Sparrowhawk test') { @@ -474,14 +431,13 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_tn_grammars_cased `pwd`' } } - } } - + stage('L2: NeMo text processing') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -500,7 +456,6 @@ pipeline { rm -rf $NORM_OUTPUT_DIR' } } - stage('L2: Eng ITN export') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && \ @@ -511,8 +466,6 @@ pipeline { rm -rf $DENORM_OUTPUT_DIR' } } - - stage('L2: Eng alignment TN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && 
NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ @@ -521,7 +474,6 @@ pipeline { rm -rf $NORM_OUTPUT_DIR' } } - stage('L2: Eng alignment ITN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ @@ -530,16 +482,14 @@ pipeline { rm -rf $DENORM_OUTPUT_DIR' } } - } } } - post { always { sh 'chmod -R 777 .' cleanWs() } } -} \ No newline at end of file +} diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index da85318b1..1ab727660 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -136,6 +136,13 @@ def __init__( from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'ko': # Korean + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) + else: + raise NotImplementedError(f"Language {lang} has not been supported yet.") self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +187,25 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'he', 'hi', 'hy', 'mr', 'ja'], + choices=[ + 'en', + 'de', + 'es', + 'pt', + 'ru', + 'fr', + 'sv', + 'vi', + 'ar', + 'es_en', + 'zh', + 'he', + 'hi', + 'hy', + 'mr', + 'ja', + 'ko', + ], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/__init__.py new file mode 100644 index 
000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv new file mode 100644 index 000000000..fd2127530 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv @@ -0,0 +1,9 @@ +달러 $ +불 $ +유로 € +엔 ¥ +파운드 £ +위안 ¥ +페소 $ +루피 ₹ +원 ₩ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/measure_units.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/measure_units.tsv new file mode 100644 index 000000000..ff79b8e91 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/measure_units.tsv @@ -0,0 +1,61 @@ +킬로미터 km +미터 m +센티미터 cm +밀리미터 mm +마이크로미터 μm +나노미터 nm +킬로그램 kg +그램 g +톤 t +밀리그램 mg +마이크로그램 μg +리터 L +밀리리터 ml +씨씨 cc +시간 h +분 min +초 s +뉴턴 N +와트 W +킬로와트 kW +킬로와트시 kWh +헤르츠 Hz +킬로헤르츠 kHz +메가헤르츠 MHz +기가헤르츠 GHz +도 ° +퍼센트 % +프로 % +분당회전수 rpm +알피엠 rpm +볼트 V +밀리볼트 mV +킬로볼트 kV +암페어 A +밀리암페어 mA +평 py +제곱미터 m² +제곱킬로미터 km² +제곱센티미터 cm² +세제곱미터 m³ +기가바이트 GB +기가 GB +테라바이트 TB +테라 TB +메가바이트 MB +메가 MB +킬로바이트 KB +바이트 B +비트 bit +칼로리 cal +킬로칼로리 kcal +줄 J +킬로줄 kJ +마력 hp +옴 Ω +파스칼 Pa +헥토파스칼 hPa +데시벨 dB +루멘 lm +럭스 lx +픽셀 px \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv new file mode 100644 index 000000000..52039ef35 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv @@ -0,0 +1,12 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +유 6 +칠 7 +팔 8 +구 9 +시 10 +십일 11 +십이 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv new file mode 100644 index 000000000..9871cb9cf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..cbf967001 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv new file mode 100644 index 000000000..e240760ed --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv @@ -0,0 +1,17 @@ +개 +명 +병 +마리 +대 +송이 +포기 +사람 +자루 +채 +켤레 +그루 +벌 +잔 +장 +권 +살 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv new file mode 100644 index 000000000..d2fdd1846 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv @@ -0,0 +1,9 @@ +한 1 +두 2 +세 3 +네 4 +다섯 5 +여섯 6 +일곱 7 +여덟 8 +아홉 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/time/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/ten_prefix.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/ten_prefix.tsv new file mode 100644 index 000000000..fe2800413 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/ten_prefix.tsv @@ -0,0 +1,4 @@ +이 2 +삼 3 +사 4 +오 5 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv new file mode 100644 index 000000000..8044e4006 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv @@ -0,0 +1,23 @@ +한 1 +두 2 +세 3 +네 4 +다섯 5 +여섯 6 +일곱 7 +여덟 8 +아홉 9 +열 10 +열한 11 +열두 12 +열세 13 +열네 14 +열다섯 15 +열여섯 16 +열일곱 17 +열여덟 18 +열아홉 19 +스무 20 +스물한 21 +스물두 22 +스물세 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv new file mode 100644 index 000000000..5ff302fb6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv @@ -0,0 +1,35 @@ +박사 Dr. +박사 dr. +씨 Mr. +씨 mr. +양 Ms. +양 ms. +여사 Mrs. +여사 mrs. +산 mt. +산 Mt. +교수 Prof. +교수 prof. +시니어 sr. +시니어 Sr. +주니어 jr. +주니어 Jr. +대로 Ave. +대로 ave. +번호 no. +번호 No. +왼쪽 괄호 ( +오른쪽 괄호 ) +더하기 + +마이너스 - +시그마 Σ +에타 η +카파 κ +오메가 ω +시그마 σ +알파 α +뉴 ν +델타 δ +이오타 ι +박사학위 Ph.D. +등 etc. 
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Shared pynini building blocks for the Korean ITN grammars: character
# classes, spacing helpers, pluralization transducers, and the GraphFst
# base class every tagger/verbalizer inherits from.

import logging
import os
import string
from pathlib import Path
from typing import Dict

import pynini
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8

from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels

# Any single valid UTF-8 character.
NEMO_CHAR = utf8.VALID_UTF8_CHAR

NEMO_NARROW_NON_BREAK_SPACE = "\u202f"
NEMO_DIGIT = byte.DIGIT
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = "\u00a0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
# Any character except the double quote (used to match token field values).
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

# Sigma-star: zero or more of any character.
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize()
# Lowercase ASCII letters excluding "a".
NEMO_LOWER_NOT_A = pynini.union(
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
).optimize()

# Spacing helpers shared by all grammars.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Removes serialized token-ordering metadata emitted by taggers.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# English pluralization rules (irregular forms from TSV, then -ies/-es/-s).
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

# Priority order: irregular > -ies > -es > -s (first match wins).
graph_plural = plurals._priority_union(
    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
).optimize()

SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
TO_UPPER = pynini.invert(TO_LOWER)
# Small weights used to bias tie-breaking between alternative paths.
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
INPUT_CASED = "cased"
INPUT_LOWER_CASED = "lower_cased"
MINUS = pynini.union("minus", "Minus").optimize()


def capitalized_input_graph(
    graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
    """
    Allow graph input to be capitalized, e.g. for ITN.

    Args:
        graph: FstGraph
        original_graph_weight: weight to add to the original `graph`
        capitalized_graph_weight: weight to add to the capitalized graph
    """
    # Lowercase the first character, then run the original graph.
    capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize()

    if original_graph_weight is not None:
        graph = pynutil.add_weight(graph, weight=original_graph_weight)

    if capitalized_graph_weight is not None:
        capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight)

    graph |= capitalized_graph
    return graph


def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
    """
    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

    Args:
        file_name: exported file name
        graphs: Mapping of a rule name and Pynini WFST graph to be exported
    """
    exporter = export.Exporter(file_name)
    for rule, graph in graphs.items():
        exporter[rule] = graph.optimize()
    exporter.close()
    logging.info(f'Created {file_name}')


def get_plurals(fst):
    """
    Given singular returns plurals

    Args:
        fst: Fst

    Returns plurals to given singular forms
    """
    return SINGULAR_TO_PLURAL @ fst


def get_singulars(fst):
    """
    Given plural returns singulars

    Args:
        fst: Fst

    Returns singulars to given plural forms
    """
    return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> 'pynini.FstLike':
    """
    Converts space to nonbreaking space.
    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
    This is making transducer significantly slower, so only use when there could be potential spaces
    within quotes, otherwise leave it.

    Args:
        fst: input fst

    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA)


def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
    """
    Loads a written/spoken label TSV and builds a whitelist transducer
    (spoken -> written). When input_case == INPUT_CASED, capitalized and
    all-caps variants of each spoken form are added, plus space-free
    abbreviation variants (e.g. "BMW" for "B M W").

    Args:
        input_file: path to the TSV of (written, spoken[, weight]) rows
        input_case: INPUT_LOWER_CASED or INPUT_CASED
    """
    labels = load_labels(input_file)

    if input_case == INPUT_CASED:
        additional_labels = []
        for written, spoken, *weight in labels:
            written_capitalized = written[0].upper() + written[1:]
            additional_labels.extend(
                [
                    [written_capitalized, spoken.capitalize()],  # first letter capitalized
                    [
                        written_capitalized,
                        spoken.upper().replace(" AND ", " and "),
                    ],  # add pairs with the all letters capitalized
                ]
            )

            spoken_no_space = spoken.replace(" ", "")
            # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W"
            # (a letter-spelled spoken form has exactly one space between letters,
            # hence len(spoken) == 2 * len(spoken_no_space) - 1)
            if len(spoken) == (2 * len(spoken_no_space) - 1):
                logging.debug(f"This is weight {weight}")
                if len(weight) == 0:
                    additional_labels.extend(
                        [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
                    )
                else:
                    additional_labels.extend(
                        [
                            [written, spoken_no_space, weight[0]],
                            [written_capitalized, spoken_no_space.upper(), weight[0]],
                        ]
                    )
        labels += additional_labels

    whitelist = pynini.string_map(labels).invert().optimize()
    return whitelist


class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # If a pre-built FAR exists for this grammar, load it instead of rebuilding.
        self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around to given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes class name wrap around output of given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore regular spaces for any non-breaking spaces introduced by convert_space.
        return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals
    e.g. 마이너스 이십삼 -> cardinal { negative: "-" integer: "23" }

    Accepts spoken Sino-Korean numbers built from the digit words and the
    unit words 십/백/천 (10/100/1000) and the 4-digit group units
    만/억/조/경 (1e4/1e8/1e12/1e16), and emits the Arabic-digit string.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        # TSV maps: zero (영/공 -> 0) and single digits (일..구 -> 1..9).
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))

        ten = pynutil.delete("십")
        ten_alt = pynini.cross("십", "1")  # bare "십" means 10: implicit leading 1
        # Responsible for second digit of two digit number. ex) 20's 2
        graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0"))
        # Responsible for the first digit of number. ex) 1,2,3,4,5,,,
        graph_ten_component += graph_digit | pynutil.insert("0")

        hundred = pynutil.delete("백")
        hundred_alt = pynini.cross("백", "1")
        graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0"))
        graph_hundred_component += graph_ten_component

        thousand = pynutil.delete("천")
        thousand_alt = pynini.cross("천", "1")
        graph_thousand_component = pynini.union(((graph_digit + thousand) | thousand_alt), pynutil.insert("0"))
        graph_thousand_component += graph_hundred_component

        # "만" marks the 10,000 unit.
        # It shifts the number by four digits (Korean units grow in 4-digit groups).
        tenthousand = pynutil.delete("만")
        tenthousand_alt = pynini.cross("만", "1")  # treat a bare "만" as a leading 1

        # (Optional) restrict to non-empty input so the thousand component
        # cannot match the empty string and emit only padding zeros.
        thousand_input = pynini.project(graph_thousand_component, "input").optimize()
        thousand_input_nonempty = pynini.difference(thousand_input, pynini.accep("")).optimize()
        graph_thousand_component_nonempty = (thousand_input_nonempty @ graph_thousand_component).optimize()

        # Handle the "만" unit (10,000).
        # Korean numbers increase by 4-digit units, so "만" shifts the value by four digits.
        # Supports <1-9999>만<0-9999>, a bare "만", "만"<1-9999>, and numbers below 만.
        graph_tenthousand_component = pynini.union(
            (graph_thousand_component + tenthousand) + graph_thousand_component,
            tenthousand_alt + pynutil.insert("0000"),
            # "만" + <1~9999>
            tenthousand_alt + graph_thousand_component_nonempty,
            # implicit leading part: <0000> + <0~9999>
            pynutil.insert("0000") + graph_thousand_component,
        ).optimize()
        hundredmillion = pynutil.delete("억")
        hundredmillion_alt = pynini.cross("억", "1")
        graph_hundredmillion_component = pynini.union(
            ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")
        )
        graph_hundredmillion_component += graph_tenthousand_component

        trillion = pynutil.delete("조")
        trillion_alt = pynini.cross("조", "1")
        graph_trillion_component = pynini.union(
            ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")
        )
        graph_trillion_component += graph_hundredmillion_component

        tenquadrillion = pynutil.delete("경")
        tenquadrillion_alt = pynini.cross("경", "1")
        graph_tenquadrillion_component = pynini.union(
            ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")
        )
        graph_tenquadrillion_component += graph_trillion_component

        graph = pynini.union(
            # From biggest unit to smallest, everything is included
            graph_tenquadrillion_component
            | graph_zero
        )

        # Strip the padding zeros inserted above, e.g. "0023" -> "23".
        leading_zero = (
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
        )
        graph = (graph @ leading_zero) | graph_zero

        # Plain digit-string graph, reused by other taggers (date, decimal, ordinal, ...).
        self.just_cardinals = graph

        negative_sign = pynini.closure(
            (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1
        )

        final_graph = (
            negative_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
        ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\""))

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class DateFst(GraphFst):
    """
    Finite state transducer for classifying date,
    e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" }
    e.g. 오월 -> date { month: "5" }
    e.g. 칠일 -> date { day: "7" }

    Args:
        cardinal: CardinalFst; its `just_cardinals` graph supplies year/day values
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")

        cardinal = cardinal.just_cardinals
        # months TSV maps spoken month names to their numbers
        month = pynini.string_file(get_abs_path("data/months.tsv"))

        # Delete the Korean date-unit suffixes; the output field name carries the meaning.
        year_suffix = pynini.cross("년", "")
        month_suffix = pynini.cross("월", "")
        day_suffix = pynini.cross("일", "")

        # Optional input space between fields; output always gets exactly one.
        delete_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1)
        between_fields = delete_space + pynutil.insert(NEMO_SPACE)

        year_component = pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"")
        month_component = pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"")
        day_component = pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"")

        # NOTE(review): graph_component duplicates the first two branches of
        # graph_date below; the union with it is redundant.
        graph_component = year_component | month_component

        # NOTE(review): no branch accepts a standalone day (e.g. 칠일) even
        # though the class docstring shows one — confirm intended coverage.
        graph_date = (
            year_component
            | month_component
            | (year_component + between_fields + month_component)
            | (month_component + between_fields + day_component)
            | (year_component + between_fields + month_component + between_fields + day_component)
        )

        final_graph = graph_component | graph_date

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


def get_quantity(decimal):
    """
    Appends a magnitude-word field to a decimal graph,
    e.g. <decimal>만 -> <decimal> quantity: "만".

    Args:
        decimal: decimal tagger graph to extend
    """
    suffix = pynini.union("만", "억", "조", "경")
    numbers = decimal
    res = numbers + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"')

    return res


class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    e.g. 일점오 -> decimal { integer_part: "1" fractional_part: "5" }
    e.g. 일점오만 -> decimal { integer_part: "1" fractional_part: "5" quantity: "만" }

    Args:
        cardinal: CardinalFst; its `just_cardinals` graph supplies the integer part
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        cardinals = cardinal.just_cardinals
        man_as_10000 = pynini.cross("만", "10000")  # a bare "만" spoken alone means 10000
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        # Fractional digits are spoken one by one, so simply concatenate digit maps.
        decimal_part = pynini.closure(graph_zero | graph_digit, 1)

        decimal_point = pynutil.delete("점")  # "점" is the spoken decimal point
        integer_number = cardinals | man_as_10000
        integer_part = pynutil.insert("integer_part: \"") + integer_number + pynutil.insert("\"")
        fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"")

        graph_decimal_regular = (
            integer_part + decimal_point + pynutil.insert(NEMO_SPACE) + fractional_part
        )  # Regular decimal like 1.5
        graph_deicimal_larger = get_quantity(
            graph_decimal_regular
        )  # If decimal is used to express big numbers like 15000 -> "1.5만"

        self.decimal = graph_decimal_regular | graph_deicimal_larger
        # Plain-string decimal ("1" or "1.5"), reused by other taggers (money, fraction).
        self.just_decimal = cardinals | (cardinals + pynini.cross("점", ".") + decimal_part)

        graph_sign = (
            pynutil.insert("negative: \"") + (pynini.cross("마이너스", "-") | pynini.accep("-")) + pynutil.insert("\"")
        )

        final_graph = (
            (graph_sign + pynutil.insert(" ") + graph_decimal_regular)
            | (graph_sign + pynutil.insert(" ") + graph_deicimal_larger)
            | graph_decimal_regular
            | graph_deicimal_larger
        )

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst


class FractionFst(GraphFst):
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Finite state transducer for classifying fractions
        e.g.,
        fraction { denominator: "사" numerator: "삼" } -> 3/4
        fraction { integer_part: "일" denominator: "사" numerator: "삼" } -> 1 3/4
        fraction { denominator: "루트삼" numerator: "일" } -> 1/√3
        fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65
        fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6
        (Korean reads fractions denominator-first: <denominator>분의<numerator>.)

        Args:
            cardinal: CardinalFst; `just_cardinals` supplies integer values
            decimal: DecimalFst; `just_decimal` supplies decimal values
        """
        super().__init__(name="fraction", kind="classify")

        cardinal = cardinal.just_cardinals
        decimal = decimal.just_decimal

        # Expression between fraction. Means the dash "/"
        fraction_word = pynutil.delete("분의")
        # Expression combining mixed number and fraction. Optional to use
        connecting_word = pynutil.delete("와") | pynutil.delete("과")
        # Expression for "√"
        root_word = pynini.accep("√") | pynini.cross("루트", "√")

        graph_sign = (
            pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("마이너스", "-")) + pynutil.insert("\"")
        )

        # graph_mixed_number considers all of possible combination number you can have in front of fraction
        graph_mixed_number = (
            pynutil.insert("integer_part: \"")
            + (
                decimal
                | (decimal + connecting_word)
                | (root_word + decimal)
                | (cardinal + root_word + decimal)
                | (root_word + decimal + connecting_word)
                | (cardinal + root_word + decimal + connecting_word)
                | cardinal
                | (cardinal + connecting_word)
                | (root_word + cardinal)
                | (cardinal + root_word + cardinal)
                | (root_word + cardinal + connecting_word)
                | (cardinal + root_word + cardinal + connecting_word)
            )
            + pynutil.insert("\"")
        )

        # Denominator comes first in Korean; accepts plain, decimal, and root forms.
        graph_denominator = (
            pynutil.insert("denominator: \"")
            + (
                (
                    decimal
                    | (cardinal + root_word + decimal)
                    | (root_word + decimal)
                    | cardinal
                    | (cardinal + root_word + cardinal)
                    | (root_word + cardinal)
                )
                + pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1)
            )
            + pynutil.insert("\"")
        )

        graph_numerator = (
            pynutil.insert("numerator: \"")
            + (
                (
                    decimal
                    | (cardinal + root_word + decimal)
                    | (root_word + decimal)
                    | cardinal
                    | (cardinal + root_word + cardinal)
                    | (root_word + cardinal)
                )
                + pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1)
            )
            + pynutil.insert("\"")
        )

        graph_fraction_sign = (
            graph_sign
            + pynutil.insert(NEMO_SPACE)
            + graph_denominator
            + pynutil.insert(NEMO_SPACE)
            + fraction_word
            + graph_numerator
        )
        graph_fraction_no_sign = graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator
        # Only fraction like "1/3" or "- 1/3"
        graph_fractions = graph_fraction_sign | graph_fraction_no_sign
        # Mixed number fraction like "2 1/3" or "-2 1/3"
        # (weight 1.1 disprefers the mixed-number parse when a plain fraction also matches)
        graph_mixed_number_fraction = (
            pynini.closure((graph_sign + pynutil.insert(" ")), 0, 1)
            + pynutil.add_weight(graph_mixed_number, 1.1)
            + pynutil.insert(NEMO_SPACE)
            + graph_denominator
            + pynutil.insert(NEMO_SPACE)
            + fraction_word
            + graph_numerator
        )

        # Optional josa (particle) after fraction (prevents "이"/"만" from being re-tokenized as cardinal)
        josa_single = pynini.union("만", "이", "가", "은", "는", "을", "를", "로", "도", "다")
        josa_multi = pynini.union("부터", "까지")
        josa = (josa_single | josa_multi | (josa_single + josa_multi)).optimize()

        trailing_josa = pynini.closure(
            pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1)  # optional space
            + pynutil.insert(' suffix: "')
            + josa
            + pynutil.insert('"'),
            0,
            1,
        )

        final_graph = (graph_fractions | graph_mixed_number_fraction) + trailing_josa
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
    NEMO_SPACE,
    GraphFst,
    convert_space,
    delete_extra_space,
)
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure
    e.g. 십이 킬로그램 -> measure { cardinal { integer: "12" } units: "kg" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="measure", kind="classify")

        base_cardinal = cardinal.just_cardinals
        man_as_10000 = pynini.cross("만", "10000")  # a bare "만" means 10000
        cardinal_graph = base_cardinal | man_as_10000
        # Graphing fraction (extended to support root denominators like "√")
        root_word = pynini.accep("√") | pynini.cross("루트", "√")
        root_cardinal = (root_word + cardinal_graph).optimize()  # e.g., 루트구 -> √9
        den_for_fraction = (cardinal_graph | root_cardinal).optimize()
        num_for_fraction = (cardinal_graph | root_cardinal).optimize()
        # spoken unit -> written abbreviation, e.g. 킬로그램 -> kg
        graph_unit = pynini.string_file(get_abs_path("data/measure_units.tsv"))

        delete_any_space = pynini.closure(pynutil.delete(NEMO_SPACE))
        # Negative sign
        negative_word = pynini.union("마이너스", "영하")
        graph_negative = pynini.cross(negative_word, 'negative: "true"') + delete_extra_space
        optional_graph_negative = pynini.closure(graph_negative, 0, 1)
        # Graphing measurement units (convert_space protects in-unit spaces inside quotes)
        unit_singular = convert_space(graph_unit)
        # For units that has "/", like km/h
        unit_per = (
            unit_singular
            + delete_any_space
            + pynini.cross(pynini.union("퍼", "당"), "/")
            + delete_any_space
            + unit_singular
        )

        graph_unit_final = pynutil.insert('units: "') + (unit_singular | unit_per) + pynutil.insert('"')

        # Graphing decimal
        graph_digit_tsv = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.cross("영", "0") | pynini.cross("공", "0")
        decimal_fractional_part = pynini.closure(graph_digit_tsv | graph_zero, 1)

        graph_decimal = (
            pynutil.insert('integer_part: "')
            + cardinal_graph
            + pynutil.insert('"')
            + delete_any_space
            + pynini.cross("점", NEMO_SPACE)
            + delete_any_space
            + pynutil.insert('fractional_part: "')
            + decimal_fractional_part
            + pynutil.insert('"')
        )

        # Graphing fraction (denominator-first: <den>분의<num>)
        graph_fraction = (
            pynutil.insert("fraction { ")
            + pynutil.insert('denominator: "')
            + den_for_fraction
            + pynutil.insert('"')
            + delete_any_space
            + pynutil.delete("분의")
            + delete_any_space
            + pynutil.insert(' numerator: "')
            + num_for_fraction
            + pynutil.insert('"')
            + pynutil.insert(" }")
        )

        final_graph_cardinal = (
            delete_any_space
            + pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + cardinal_graph
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + pynutil.insert(NEMO_SPACE)
            + delete_any_space
            + graph_unit_final
        )

        final_graph_decimal = (
            delete_any_space
            + pynutil.insert("decimal { ")
            + optional_graph_negative
            + graph_decimal
            + pynutil.insert(" }")
            + pynutil.insert(NEMO_SPACE)
            + delete_any_space
            + graph_unit_final
        )

        final_graph_fraction = (
            delete_any_space + graph_fraction + pynutil.insert(NEMO_SPACE) + delete_any_space + graph_unit_final
        )

        final_graph = final_graph_cardinal | final_graph_decimal | final_graph_fraction

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
    e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "₩" }

    Args:
        cardinal: CardinalFst  # NOTE(review): accepted but unused; only `decimal` is read
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")

        # just_decimal covers both plain cardinals and decimals like "1.5".
        decimals = decimal.just_decimal
        # currency TSV maps spoken currency words to symbols, e.g. 원 -> ₩
        currency = pynini.string_file(get_abs_path("data/currency.tsv"))

        man_as_10000 = pynini.cross("만", "10000")  # a bare "만" means 10000
        number_for_money = decimals | man_as_10000

        # Accepting space if there is one between integer and currency
        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)
        graph_integer = pynutil.insert('integer_part: "') + number_for_money + pynutil.insert('"') + spacing
        graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"")

        graph_final = graph_integer + graph_unit

        final_graph = self.add_tokens(graph_final)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


def get_counter(ordinal):
    """
    Appends a counter word (개, 명, 병, 마리, ...) and an optional trailing
    josa (particle) to a number graph, emitting them as `counter:` and
    `suffix:` fields.

    Args:
        ordinal: number graph to extend
    """
    # counter suffix file (개, 명, 병, 마리, ...)
    suffix = pynini.string_file(get_abs_path("data/ordinals/counter_suffix.tsv"))
    # allowed trailing josa (optional) to capture forms like "네개를", "여섯명만"
    josa_single = pynini.union("만", "이", "가", "은", "는", "을", "를", "로", "도", "다")
    josa_multi = pynini.union("부터", "까지")
    josa = (josa_single | josa_multi | (josa_single + josa_multi)).optimize()

    counter_field = pynutil.insert('" counter: "') + suffix
    suffix_field = pynutil.insert('" suffix: "') + josa

    return (
        ordinal
        + pynini.closure(delete_space, 0, 1)
        + counter_field
        + pynini.closure(pynini.closure(delete_space, 0, 1) + suffix_field, 0, 1)
    )


class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
    Expressing integers in ordinal way for 1-39 and cardinal for 40+ due to Korean grammar.
    e.g. 스물세번째 -> ordinal { integer: "23번째" }
    e.g. 사십오번째 -> ordinal but the integer part is spoken in cardinal (due to Korean grammar)
        { integer: "45번째" }

    Args:
        cardinal: CardinalFst; `just_cardinals` supplies the 40+ values
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        cardinals = cardinal.just_cardinals
        man_as_10000 = pynini.cross("만", "10000")
        ordinals_suffix = pynini.accep("번째")  # Korean ordinal's morphosyntactic feature

        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))  # 1-9 in ordinals
        cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))  # 1-9 in cardinals

        graph_tens_prefix = pynini.cross("열", "1")  # First digit for tens
        graph_twenties_prefix = pynini.cross("스물", "2")  # First digit for twenties
        graph_thirties_prefix = pynini.cross("서른", "3")  # First digit for thirties

        # Below exclude regular 1 in ordinal and replace with a special 1. Like "first" in English
        # The special 1 is a unique ordinal case for Korean and does not repeat for 11, 21, 31
        graph_one = pynini.cross("한", "1")
        single_digits = pynini.project(graph_digit, "input").optimize()
        graph_one_acceptor = pynini.project(graph_one, "input").optimize()
        two_to_nine = pynini.difference(single_digits, graph_one_acceptor).optimize()
        graph_two_to_nine = two_to_nine @ graph_digit
        graph_first = pynini.cross("첫", "1")
        graph_single = graph_two_to_nine | graph_first

        graph_ten = pynini.cross("열", "10")
        graph_tens = graph_ten | graph_tens_prefix + graph_digit

        graph_twenty = pynini.cross("스무", "20")
        graph_twenties = graph_twenty | graph_twenties_prefix + graph_digit

        graph_thirty = pynini.cross("서른", "30")
        graph_thirties = graph_thirty | graph_thirties_prefix + graph_digit

        ordinals = pynini.union(
            graph_single, graph_tens, graph_twenties, graph_thirties  # 1-9 # 10-19 # 20-29 # 30-39
        ).optimize()

        cardinal_10_to_19 = pynini.cross("십", "10") | (pynini.accep("십") + cardinal_digit)

        cardinal_20_to_29 = pynini.cross("이십", "20") | (pynini.accep("이십") + cardinal_digit)

        cardinal_30_to_39 = pynini.cross("삼십", "30") | (pynini.accep("삼십") + cardinal_digit)

        # FST that include 1-39 in cardinal expression
        cardinal_below_40 = pynini.union(
            cardinal_digit, cardinal_10_to_19, cardinal_20_to_29, cardinal_30_to_39
        ).optimize()

        # Input includes all cardinal expressions
        cardinals_acceptor = pynini.project(cardinals, "input").optimize()
        # Input includes cardinal expression from 1 to 39
        cardinals_exception = pynini.project(cardinal_below_40, "input").optimize()

        # All cardinal values except 1 to 39 cardinal values
        cardinal_over_40 = pynini.difference(cardinals_acceptor, cardinals_exception).optimize()
        cardinal_ordinal_suffix = cardinal_over_40 @ cardinals

        # 1 to 39 in ordinal, everything else cardinal
        ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix, man_as_10000)

        # NOTE: ordinals_suffix is an acceptor, so "번째" is kept inside the integer field.
        ordinal_graph = (
            pynutil.insert("integer: \"") + ((ordinal_final + delete_space + ordinals_suffix)) + pynutil.insert("\"")
        )

        # Adding various counter suffix for ordinal
        # For counting, Korean does not use the special "첫" for 1. Instead the regular "한"
        counters = pynini.union(graph_digit, graph_tens, graph_twenties, graph_thirties).optimize()

        counter_final = get_counter(counters) | get_counter(cardinal_ordinal_suffix) | get_counter(man_as_10000)

        counter_graph = pynutil.insert("integer: \"") + counter_final + pynutil.insert("\"")

        final_graph = ordinal_graph | counter_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying a generic 3-4-4 telephone number. + e.g. 공일공에 일이삼사에 오육칠팔 -> telephone { number_part: "010-1234-5678" } + + """ + + def __init__(self): + super().__init__(name="telephone", kind="classify") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_zero_alt = pynini.cross("공", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + + digit = graph_digit | graph_zero | graph_zero_alt + + separator = pynini.cross(pynini.union(NEMO_SPACE, "에"), "-") + + digit2 = digit**2 + digit3 = digit**3 + digit4 = digit**4 + + optional_separator = pynini.closure(separator, 0, 1) + + phone_number_graph = ( + pynutil.insert('number_part: "') + + pynini.union(digit2, digit3) + + optional_separator + + pynini.union(digit3, digit4) + + optional_separator + + digit4 + + pynutil.insert('"') + ) + + graph = phone_number_graph + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py new file mode 100644 index 
000000000..30080639b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -0,0 +1,101 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" } + e.g. 12분전 -> time { minutes: "12" suffix: "전" } + e.g. 새벽 두시 -> time { hours: "2" suffix: "새벽" } + e.g. 두시반 -> time { hours: "2" minutes: "30" } + e.g. 
오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } + """ + + def __init__(self): + super().__init__(name="time", kind="classify") + + # 1-9 in cardinals for minutes and seconds + cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + cardinal_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + + graph_tens_prefix = pynini.string_file(get_abs_path("data/time/ten_prefix.tsv")) + + # Graphing 10-19 + graph_ten = pynini.union(pynini.cross("십", "10"), pynini.cross("십", "1") + cardinal_digit).optimize() + # Graphing 20-59 + graph_tens = (graph_tens_prefix + pynini.cross("십", "0")) | ( + graph_tens_prefix + pynini.cross("십", "") + cardinal_digit + ) + + graph_0_to_59 = pynini.union(cardinal_zero, cardinal_digit, graph_ten, graph_tens).optimize() + + # 1-12 for hours + graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) + + # Adding space if there are one + spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) + + hour_suffix = pynini.cross("시", "") + minute_suffix = pynini.cross("분", "") + second_suffix = pynini.cross("초", "") + + hour_component = pynutil.insert("hours: \"") + (graph_hours + spacing + hour_suffix) + pynutil.insert("\"") + + # half minute only allowed after hours: "두시반" / "두시 반" + half_minute_component = pynutil.insert('minutes: "30"') + spacing + pynini.cross("반", "") + + minute_component = ( + pynutil.insert("minutes: \"") + (graph_0_to_59 + spacing + minute_suffix) + pynutil.insert("\"") + ) + + second_component = ( + pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"") + ) + + hm_opt = pynini.closure(delete_space + minute_component, 0, 1) + hs_opt = pynini.closure(delete_space + second_component, 0, 1) + + h_half = hour_component + delete_space + half_minute_component + hs_opt + hms = hour_component + hm_opt + hs_opt + ms = minute_component + pynini.closure(delete_space + second_component, 0, 1) + s_only = second_component + + graph_regular 
= pynini.union(h_half, hms, ms, s_only).optimize() + + # 오전 = AM, 오후 = PM + ampm_words = pynini.union("오전", "오후", "새벽", "아침") + ampm_tag = pynutil.insert('suffix: "') + ampm_words + pynutil.insert('"') + + # 전 = before, 후 = after + suffix_words = pynini.accep("전") | pynini.accep("후") + suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") + + time_graph = ( + pynini.closure(delete_space + ampm_tag, 0, 1) + + graph_regular + + pynini.closure(delete_space + suffix_tag, 0, 1) + ) + + # Adding cardinal graph to prevent processing out of range numbers + final_graph = time_graph + + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..8e5d39c85 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,127 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str = INPUT_LOWER_CASED, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_itn_{input_case}_tokenize.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + ordinal_graph = ordinal.fst + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + fraction = FractionFst(cardinal, decimal) + fraction_graph = fraction.fst + + time = TimeFst() + time_graph = time.fst + + date = DateFst(cardinal) + date_graph = date.fst + + money = MoneyFst(cardinal, decimal) + money_graph = money.fst + + telephone = TelephoneFst() + telephone_graph = telephone.fst + + measure = MeasureFst(cardinal) + measure_graph = measure.fst + + word_graph = WordFst().fst + whitelist_graph = WhiteListFst().fst + + classify = ( + pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.0) + | pynutil.add_weight(time_graph, 1.0) + | pynutil.add_weight(date_graph, 1.1) + | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(measure_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + | pynutil.add_weight(whitelist_graph, 1.01) + ) + + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + space = 
NEMO_WHITE_SPACE @ delete_extra_space + space_opt = pynini.closure(space, 0, 1) + + graph = delete_space + token + pynini.closure(space_opt + token) + delete_space + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py new file mode 100644 index 000000000..fd443c287 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". 
+ """ + + def __init__(self): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = (pynutil.insert('name: "')) + (whitelist) + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py new file mode 100644 index 000000000..0e4dbb93c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py new file mode 100644 index 000000000..d198c3835 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +def get_abs_path(rel_path): + + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..fb9a76d8e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + negative_sign = ( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + ) + + optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) + + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + integer_cardinal = ( + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") + ) + + graph = integer_cardinal + final_graph = optional_sign_output + graph + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py new file mode 100644 index 000000000..88ed973df --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date, + e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" } + e.g. 오월 -> date { month: "5" } + e.g. 
칠일 -> date { day: "7" } + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + year_component = ( + pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("년") + pynutil.delete("\"") + ) + month_component = ( + pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("월") + pynutil.delete("\"") + ) + day_component = ( + pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("일") + pynutil.delete("\"") + ) + + graph = ( + pynini.closure(pynutil.delete(NEMO_SPACE) + year_component, 0, 1) + + pynini.closure(pynutil.delete(NEMO_SPACE) + month_component, 0, 1) + + pynini.closure(pynutil.delete(NEMO_SPACE) + day_component, 0, 1) + ) + + final_graph = self.delete_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py new file mode 100644 index 000000000..65f225f45 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal + e.g. 
decimal { integer_part: "1" fractional_part: "5" } -> 1.5 + e.g. decimal { integer_part: "1" fractional_part: "5" quantity: "만" } -> 1.5만 + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + + decimal_point = pynutil.insert(".") + integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + fractional_part = pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + quantity_part = pynutil.delete("quantity: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph_decimal = integer_part + decimal_point + pynutil.delete(" ") + fractional_part + graph_decimal_larger = ( + integer_part + decimal_point + pynutil.delete(" ") + fractional_part + pynutil.delete(" ") + quantity_part + ) + + graph_sign = pynutil.delete("negative: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph = ( + graph_decimal + | graph_decimal_larger + | (graph_sign + pynutil.delete(" ") + graph_decimal) + | (graph_sign + pynutil.delete(" ") + graph_decimal_larger) + ) + + final_graph = self.delete_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py new file mode 100644 index 000000000..c079bbbca --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NON_BREAKING_SPACE, + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, +) + + +class FractionFst(GraphFst): + def __init__(self): + """ + Finite state transducer for verbalizing fractions + e.g., + fraction { denominator: "사" numerator: "삼" } -> 3/4 + fraction { integer_part: "일" denominator: "사" numerator: "삼" } -> 1 3/4 + fraction { denominator: "루트삼" numerator: "일" } -> 1/√3 + fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65 + fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6 + """ + super().__init__(name="fraction", kind="verbalize") + + sign_component = pynutil.delete("negative: \"") + pynini.closure("-", 1) + pynutil.delete("\"") + + mixed_number_component = ( + pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + denominator_component = ( + pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + numerator_component = ( + pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + suffix_component = ( + pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) + + pynutil.delete('suffix: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + optional_suffix = pynini.closure(pynutil.insert(" ") + suffix_component, 0, 1) + + regular_graph = ( + pynini.closure((sign_component + pynutil.delete(NEMO_SPACE)), 0, 1) + + pynini.closure( + mixed_number_component + 
pynutil.delete(NEMO_SPACE) + pynutil.insert(NEMO_NON_BREAKING_SPACE) + ) + + numerator_component + + pynutil.delete(NEMO_SPACE) + + pynutil.insert("/") + + denominator_component + + optional_suffix + ) + + final_graph = self.delete_tokens(regular_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py new file mode 100644 index 000000000..cedf4703d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure + e.g. 
십이 킬로그램 -> measure { cardinal { integer: "12" } units: "kg" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self): + super().__init__(name="measure", kind="verbalize") + + measurement = pynini.closure(NEMO_NOT_QUOTE, 1) + + optional_sign = pynini.closure(pynutil.delete('negative: "true"') + delete_space + pynutil.insert("-"), 0, 1) + + unit = pynutil.delete('units: "') + measurement + pynutil.delete('"') + + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + pynutil.delete('integer: "') + + measurement + + pynutil.delete('"') + + delete_space + + pynutil.delete("}") + ) + + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + optional_sign + + delete_space + + pynutil.delete('integer_part: "') + + measurement + + pynutil.delete('"') + + delete_space + + pynutil.delete('fractional_part: "') + + pynutil.insert(".") + + measurement + + pynutil.delete('"') + + delete_space + + pynutil.delete("}") + ) + + graph_fraction = ( + pynutil.delete("fraction {") + + delete_space + + optional_sign + + delete_space + + pynutil.delete('numerator: "') + + measurement + + pynutil.delete('"') + + delete_space + + pynutil.delete('denominator: "') + + pynutil.insert("/") + + measurement + + pynutil.delete('"') + + delete_space + + pynutil.delete("}") + ) + + graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + pynutil.insert(" ") + unit + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py new file mode 100644 index 000000000..45e4c7e2c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_CHAR, + NEMO_SPACE, + GraphFst, + delete_space, +) + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money + e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "원" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self): + super().__init__(name="money", kind="verbalize") + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + + pynutil.delete('"') + ) + + unit = ( + pynutil.delete("currency:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + + pynutil.delete('"') + ) + + optional_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1).optimize() + + graph = unit + optional_space + integer + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py new file mode 100644 index 000000000..0fedb457b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinals + e.g. 스물세번째 -> ordinal {integer: "23", 23번째} + e.g. 사십오번째 -> ordinal but the integer part is written in cardinal(due to korean grammar) + { integer: "45", 45번째} + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + + integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + counter_component = pynutil.delete("counter: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + suffix_component = pynutil.delete('suffix: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + + graph_with_counter = ( + integer_component + + delete_space + + counter_component + + pynini.closure(delete_space + suffix_component, 0, 1) + ) + + ordinal_verbalizer = pynini.union(graph_with_counter, integer_component) + + final_graph = self.delete_tokens(ordinal_verbalizer) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py new file mode 100644 index 000000000..96794c610 --- +++ 
b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying a generic 3-4-4 telephone number. + e.g. 공일공에 일이삼사에 오육칠팔 -> telephone { number: "010-1234-5678" } + + """ + + def __init__(self): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete('number_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + delete_tokens = self.delete_tokens(number_part) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py new file mode 100644 index 000000000..4b63ade99 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" } + e.g. 12분전 -> time { minutes: "12" suffix: "전" } + e.g. 새벽 두시 -> time { hours: "2" suffix: "새벽" } + e.g. 두시반 -> time { hours: "2" minutes: "30" } + e.g. 오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + + hours_component = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + minutes_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + seconds_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + prefix_component = pynutil.delete("prefix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + # Add a leading zero to single-digit minutes/seconds + single_digit = NEMO_DIGIT + leading_zero = pynutil.insert("0") + single_digit + add_leading_zero = pynini.union(single_digit @ leading_zero, pynini.closure(NEMO_DIGIT, 2)) + + minutes = minutes_component @ add_leading_zero + seconds = seconds_component @ add_leading_zero + + # Defining all the possible combinations + path_h = 
hours_component + pynutil.insert(":00") + path_m = minutes + path_s = seconds + + path_hm = hours_component + delete_space + pynutil.insert(":") + minutes + path_hs = ( + hours_component + + delete_space + + pynutil.insert(":") + + pynutil.insert("00") + + delete_space + + pynutil.insert(":") + + seconds + ) + path_ms = minutes + delete_space + pynutil.insert(":") + seconds + + path_hms = ( + hours_component + + delete_space + + pynutil.insert(":") + + minutes + + delete_space + + pynutil.insert(":") + + seconds + ) + + time_graph = pynini.union(path_h, path_m, path_s, path_hm, path_hs, path_ms, path_hms) + + # Adding prefix and suffix space + optional_prefix_out = pynini.closure(delete_space + prefix_component, 0, 1) + optional_suffix_out = pynini.closure(delete_space + pynutil.insert(NEMO_SPACE) + suffix_component, 0, 1) + + final_graph = optional_prefix_out + time_graph + optional_suffix_out + self.fst = self.delete_tokens(delete_space + final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..17e709555 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pynini + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. 
+ """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst + + decimal = DecimalFst() + decimal_graph = decimal.fst + + fraction = FractionFst() + fraction_graph = fraction.fst + + time = TimeFst() + time_graph = time.fst + + date = DateFst() + date_graph = date.fst + + money = MoneyFst() + money_graph = money.fst + + telephone = TelephoneFst() + telephone_graph = telephone.fst + + measure = MeasureFst() + measure_graph = measure.fst + + word = WordFst() + word_graph = word.fst + + whitelist_graph = WhiteListFst().fst + + graph = pynini.union( + cardinal_graph, + ordinal_graph, + decimal_graph, + fraction_graph, + time_graph, + date_graph, + money_graph, + telephone_graph, + measure_graph, + word_graph, + whitelist_graph, + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..6bcca5fb8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py new file mode 100644 index 000000000..395423017 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class WhiteListFst(GraphFst): + ''' + tokens { whitelist: "ATM" } -> A T M + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic) + + whitelist = pynutil.delete("name: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + graph = whitelist + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py new file mode 100644 index 000000000..226b41e08 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. 
tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 0852329d6..c93d8df64 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "ko", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py new file mode 100644 index 000000000..dd0e509b3 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/ko/data/__init__.py b/nemo_text_processing/text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/date/__init__.py b/nemo_text_processing/text_normalization/ko/data/date/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv b/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv new file mode 100644 index 000000000..2f54cee92 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv @@ -0,0 +1,2 @@ +6 유 +10 시 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/date/week.tsv b/nemo_text_processing/text_normalization/ko/data/date/week.tsv new file mode 100644 index 000000000..bc205bc3f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/week.tsv @@ -0,0 +1,8 @@ +월 월요일 +화 화요일 +수 수요일 +목 목요일 +금 금요일 +토 토요일 +일 일요일 +공 공휴일 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py b/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv new file mode 100644 index 000000000..f562cfbad --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv @@ -0,0 +1,11 @@ +카드 끝자리 카드 끝자리 +카드 마지막 네자리 카드 마지막 네자리 +카드 마지막 4자리 카드 마지막 네자리 +신용카드 번호 신용카드 번호 +신용카드 신용카드 +체크카드 번호 체크카드 번호 +체크카드 체크카드 +카드번호 카드번호 +결제 카드 결제 카드 +결제카드 결제카드 +카드 카드 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv new file mode 100644 index 000000000..3d04ca298 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv @@ -0,0 +1,28 @@ +.com 닷컴 +.org 닷 오알지 +.gov 닷 거브 +.edu 닷 에듀 +.net 닷 넷 +.ai 닷 에이아이 +.io 닷 아이오 +.dev 닷 데브 +.app 닷 앱 +.cloud 닷 클라우드 +.shop 닷 샵 +.store 닷 스토어 +.co 닷 씨오 +.me 닷 미 +.kr 닷 케이알 +.co.kr 닷 씨오 닷 케이알 +.ac.kr 닷 에이씨 닷 케이알 +.or.kr 닷 오알 닷 케이알 +.go.kr 닷 지오 닷 케이알 +.re.kr 닷 알이 닷 케이알 +.cn 닷 씨엔 +.fr 닷 에프알 +.de 닷 디이 +.it 닷 아이티 +.uk 닷 유케이 +.br 닷 비알 +.in 닷 아이엔 +.ru 닷 알유 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv new file mode 100644 index 000000000..c80d08a69 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv @@ -0,0 +1,6 @@ +.jpg 닷 제이피지 +.png 닷 피엔지 +.pdf 닷 피디에프 +.JPG 닷 제이피지 +.PNG 닷 피엔지 +.PDF 닷 피디에프 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv new file mode 100644 index 000000000..f551dabf4 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv @@ -0,0 +1,12 @@ +@ 골뱅이 +. 점 +/ 슬래시 +- 대시 +_ 언더바 +: 콜론 +? 
물음표 += 이퀄 +& 앰퍼샌드 +% 퍼센트 ++ 플러스 +# 샵 diff --git a/nemo_text_processing/text_normalization/ko/data/measure/__init__.py b/nemo_text_processing/text_normalization/ko/data/measure/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv b/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv new file mode 100644 index 000000000..ccec41e7f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv @@ -0,0 +1,18 @@ +kg 킬로그램 +g 그램 +km 킬로미터 +m 미터 +cm 센티미터 +mm 밀리미터 +L 리터 +l 리터 +mL 밀리리터 +ml 밀리리터 +h 시간 +s 초 +N 뉴턴 +W 와트 +Hz 헤르츠 +° 도 +% 퍼센트 +rpm 분당회전수 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/money/__init__.py b/nemo_text_processing/text_normalization/ko/data/money/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv b/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv new file mode 100644 index 000000000..a5ddfeb45 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv @@ -0,0 +1,22 @@ +₩ 원 +KRW 원 +krw 원 +$ 달러 +US$ 달러 +HK$ 홍콩 달러 +hk$ 홍콩 달러 +€ 유로 +EUR 유로 +¥ 엔 +JPY 엔 +CAD 캐나다 달러 +cad 캐나다 달러 +NZD 뉴질랜드 달러 +nzd 뉴질랜드 달러 +CHF 스위스 프랑 +chf 스위스 프랑 +AED 아랍에미리트 디르함 +aed 아랍에미리트 디르함 +Dh 디르함 +DH 디르함 +Dhs. 디르함 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/number/counter_suffix.tsv b/nemo_text_processing/text_normalization/ko/data/number/counter_suffix.tsv new file mode 100644 index 000000000..6c48f5e7d --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/counter_suffix.tsv @@ -0,0 +1,16 @@ +개 +명 +병 +마리 +송이 +포기 +사람 +자루 +채 +켤레 +그루 +벌 +잔 +장 +권 +살 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv new file mode 100644 index 000000000..61a7dddcf --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv @@ -0,0 +1,9 @@ +1 일 +2 이 +3 삼 +4 사 +5 오 +6 육 +7 칠 +8 팔 +9 구 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/native_ones.tsv b/nemo_text_processing/text_normalization/ko/data/number/native_ones.tsv new file mode 100644 index 000000000..f9926b64b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/native_ones.tsv @@ -0,0 +1,9 @@ +1 한 +2 두 +3 세 +4 네 +5 다섯 +6 여섯 +7 일곱 +8 여덟 +9 아홉 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/tens.tsv b/nemo_text_processing/text_normalization/ko/data/number/tens.tsv new file mode 100644 index 000000000..d8b8e0a2b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/tens.tsv @@ -0,0 +1,9 @@ +1 십 +2 이십 +3 삼십 +4 사십 +5 오십 +6 육십 +7 칠십 +8 팔십 +9 구십 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv new file mode 100644 index 000000000..7024c0534 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv @@ -0,0 +1 @@ +0 영 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py 
b/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv new file mode 100644 index 000000000..b3efc4cef --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv @@ -0,0 +1,8 @@ +2 두 +3 세 +4 네 +5 다섯 +6 여섯 +7 일곱 +8 여덟 +9 아홉 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv new file mode 100644 index 000000000..ad796a0e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv @@ -0,0 +1,5 @@ +1 첫 +11 열한 +20 스무 +21 스물한 +31 서른한 diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv new file mode 100644 index 000000000..994a21a1d --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv @@ -0,0 +1,2 @@ +10 열 +30 서른 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv 
b/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv new file mode 100644 index 000000000..3111a82dc --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv @@ -0,0 +1,3 @@ +1 열 +2 스물 +3 서른 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/time/__init__.py b/nemo_text_processing/text_normalization/ko/data/time/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/time/division.tsv b/nemo_text_processing/text_normalization/ko/data/time/division.tsv new file mode 100644 index 000000000..9250d0a8f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/division.tsv @@ -0,0 +1,7 @@ +오전 +오후 +새벽 +아침 +낮 +저녁 +밤 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/time/hour.tsv b/nemo_text_processing/text_normalization/ko/data/time/hour.tsv new file mode 100644 index 000000000..abfccd310 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/hour.tsv @@ -0,0 +1,12 @@ +1 한 +2 두 +3 세 +4 네 +5 다섯 +6 여섯 +7 일곱 +8 여덟 +9 아홉 +10 열 +11 열한 +12 열두 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/whitelist.tsv b/nemo_text_processing/text_normalization/ko/data/whitelist.tsv new file mode 100644 index 000000000..82dc1220e --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/whitelist.tsv @@ -0,0 +1,38 @@ +Dr. 박사 +dr. 박사 +Mr. 씨 +mr. 씨 +Ms. 양 +ms. 양 +Mrs. 여사 +mrs. 여사 +mt. 산 +Mt. 산 +Prof. 교수 +prof. 교수 +sr. 시니어 +Sr. 시니어 +jr. 주니어 +Jr. 주니어 +rd. 로 +Rd. 로 +Ave. 대로 +ave. 대로 +no. 번호 +No. 번호 +( 왼쪽 괄호 +) 오른쪽 괄호 ++ 더하기 +- 마이너스 +Σ 시그마 +η 에타 +κ 카파 +ω 오메가 +σ 시그마 +α 알파 +ν 뉴 +δ 델타 +ι 이오타 +vs. 대 +Ph.D. 박사학위 +etc. 등 diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..9db51238f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/graph_utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import string
from pathlib import Path
from typing import Dict

import pynini
from pynini import Far
from pynini.export import export
from pynini.lib import byte, pynutil, utf8

from nemo_text_processing.text_normalization.en.utils import load_labels
from nemo_text_processing.utils.logging import logger

# Shared character classes for the Korean (ko) grammars.
NEMO_CHAR = utf8.VALID_UTF8_CHAR

NEMO_DIGIT = byte.DIGIT
# NOTE: unlike the ASCII-based en grammars, "alpha" here covers the Hangul
# syllable block (가..힣) only; Latin letters are NOT included.
NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# Whitespace helpers used by taggers/verbalizers.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Deletes serialization metadata emitted by the tokenizer:
#   " preserve_order: true"  or  ' field_order: "<value>"'.
# FIX: the field_order value must consume one or more non-quote characters;
# a single NEMO_NOT_QUOTE only matched one-character values, so any longer
# field order was never deleted.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(' field_order: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"'))
)


# Common string literals; expand as you see fit.
username_string = "username"
double_quotes = '"'
domain_string = "domain"
protocol_string = "protocol"
slash = "/"
double_slash = "//"
triple_slash = "///"
file = "file"
period = "."
at = "@"
colon = ":"
https = "https"
http = "http"
www = "www"


def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
    """
    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

    Args:
        file_name: exported file name
        graphs: Mapping of a rule name and Pynini WFST graph to be exported
    """
    exporter = export.Exporter(file_name)
    for rule, graph in graphs.items():
        exporter[rule] = graph.optimize()
    exporter.close()
    logger.info(f"Created {file_name}")


def convert_space(fst) -> "pynini.FstLike":
    """
    Converts space to nonbreaking space.
    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
    This is making transducer significantly slower, so only use when there could be potential spaces
    within quotes, otherwise leave it.

    Args:
        fst: input fst

    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)


def string_map_cased(input_file: str, input_case: str = "lower_cased"):
    """
    Loads a two-column TSV and returns it as an inverted, optimized string map.

    Args:
        input_file: path to a TSV of (written, spoken) label pairs
        input_case: accepted for API compatibility with other languages.
            NOTE(review): currently unused — casing variants are not generated
            here; confirm whether cased handling is needed for ko.

    Returns:
        inverted string-map FST built from the file's labels
    """
    labels = load_labels(input_file)
    whitelist = pynini.string_map(labels).invert().optimize()
    return whitelist


class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Pre-built grammars are cached as FAR files next to this module.
        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps class name around to given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes class name wrap around output of given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore breaking spaces that convert_space() turned into NBSP.
        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..e2b0f87e3 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,327 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinal numbers in Korean,
    e.g. 23 -> cardinal { integer: "..." } with readings taken from the
    data/number/*.tsv tables (presumably Sino-Korean 일/이/삼 — see the TSVs).

    Korean groups digits in units of 10^4: the grammar inserts 백 (100) and
    천 (1000) inside a group, and the group markers 만 (10^4), 억 (10^8),
    조 (10^12) and 경 (10^16) at the corresponding digit widths. Each
    `graph_*_component` below handles exactly one input width (NEMO_DIGIT**n),
    re-using the components of smaller widths for the remainder digits and
    deleting runs of '0' placeholders.

    Also covers:
      - native-Korean number + counter tokens (e.g. 3개, 2명, 10살),
      - signed numbers (+/-) and accounting-style "( n )" negatives.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        # Optional small whitespace inside parentheses or after signs
        ws = pynini.closure(NEMO_SPACE, 0, 2)

        # Load base .tsv files (digit -> Korean reading maps)
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))

        # '1' is special-cased (bare 백/천/만, no leading digit word), and '0'
        # never leads a group, so build a digit map excluding both.
        digit_except_one = pynini.difference(NEMO_DIGIT, "1")
        digit_except_zero_one = pynini.difference(digit_except_one, "0")

        graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
        graph_tens = pynini.string_file(get_abs_path("data/number/tens.tsv"))

        # Compose all basic number forms
        # 1-99: tens word plus optional units digit (trailing '0' deleted), or a bare digit.
        graph_1_to_99 = (graph_tens + (graph_digit | pynutil.delete('0'))) | graph_digit

        # 3-digit numbers: leading digit + 백, then 0-2 remainder digits.
        hundreds = NEMO_DIGIT**3
        graph_hundred_component = (
            pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))
        ) + pynini.union(pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99))
        graph_hundred = hundreds @ graph_hundred_component

        # 4-digit numbers: leading digit + 천, remainder via hundreds/tens components.
        thousands = NEMO_DIGIT**4
        graph_thousand_component = pynini.union(
            pynini.cross('1', '천'),
            graph_digit_no_zero_one + pynutil.insert('천'),
        ) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_hundred_component,
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_thousand = thousands @ graph_thousand_component

        # 5 digits: leading digit + 만 (10^4), remainder is a 4-digit tail.
        ten_thousands = NEMO_DIGIT**5
        graph_ten_thousand_component = pynini.union(
            pynini.cross('1', '만'),
            graph_digit_no_zero_one + pynutil.insert('만'),
        ) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_thousand_component,
            (pynutil.delete('0') + graph_hundred_component),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_ten_thousand = ten_thousands @ graph_ten_thousand_component

        # 6 digits: two leading digits (10-99) + 만, then a 4-digit tail.
        hundred_thousands = NEMO_DIGIT**6
        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert("만")) + pynini.union(
            pynini.closure(pynutil.delete("0")),
            graph_thousand_component,
            (pynutil.delete("0") + graph_hundred_component),
            (pynini.closure(pynutil.delete("0")) + graph_1_to_99),
        )
        graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component

        # 7 digits: three leading digits (read as a hundreds group) + 만.
        millions = NEMO_DIGIT**7
        graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_thousand_component,
            (pynutil.delete('0') + graph_hundred_component),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_million = millions @ graph_million_component

        # 8 digits: four leading digits (read as a thousands group) + 만.
        ten_millions = NEMO_DIGIT**8
        graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_thousand_component,
            (pynutil.delete('0') + graph_hundred_component),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_ten_million = ten_millions @ graph_ten_million_component

        # 9 digits: leading digit + 억 (10^8); the remainder union covers every
        # possible width of the 8-digit tail, deleting leading-zero padding.
        hundred_millions = NEMO_DIGIT**9
        graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_ten_million_component,
            (pynutil.delete('0') + graph_million_component),
            (pynutil.delete('00') + graph_hundred_thousand_component),
            (pynutil.delete('000') + graph_ten_thousand_component),
            (pynutil.delete('0000') + graph_thousand_component),
            ((pynutil.delete('00000') + graph_hundred_component)),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_hundred_million = hundred_millions @ graph_hundred_million_component

        # 10 digits: two leading digits + 억, same 8-digit tail union.
        thousand_millions = NEMO_DIGIT**10
        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('억')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_ten_million_component,
            (pynutil.delete('0') + graph_million_component),
            (pynutil.delete('00') + graph_hundred_thousand_component),
            (pynutil.delete('000') + graph_ten_thousand_component),
            (pynutil.delete('0000') + graph_thousand_component),
            ((pynutil.delete('00000') + graph_hundred_component)),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_thousand_million = thousand_millions @ graph_thousand_million_component

        # 11 digits: three leading digits (hundreds group) + 억.
        billions = NEMO_DIGIT**11
        graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_ten_million_component,
            (pynutil.delete('0') + graph_million_component),
            (pynutil.delete('00') + graph_hundred_thousand_component),
            (pynutil.delete('000') + graph_ten_thousand_component),
            (pynutil.delete('0000') + graph_thousand_component),
            ((pynutil.delete('00000') + graph_hundred_component)),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_billions = billions @ graph_billions_component

        # 12 digits: four leading digits (thousands group) + 억.
        ten_billions = NEMO_DIGIT**12
        graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_ten_million_component,
            (pynutil.delete('0') + graph_million_component),
            (pynutil.delete('00') + graph_hundred_thousand_component),
            (pynutil.delete('000') + graph_ten_thousand_component),
            (pynutil.delete('0000') + graph_thousand_component),
            ((pynutil.delete('00000') + graph_hundred_component)),
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_ten_billions = ten_billions @ graph_ten_billions_component

        # 13 digits: leading digit + 조 (10^12); tail union spans all smaller widths.
        hundred_billions = NEMO_DIGIT**13
        graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union(
            pynini.closure(pynutil.delete('0')),
            graph_ten_billions_component,
            pynutil.delete('0') + graph_billions_component,
            pynutil.delete('00') + graph_thousand_million_component,
            pynutil.delete('000') + graph_hundred_million_component,
            pynutil.delete('0000') + graph_ten_million_component,
            pynutil.delete('00000') + graph_million_component,
            pynutil.delete('000000') + graph_hundred_thousand_component,
            pynutil.delete('0000000') + graph_ten_thousand_component,
            pynutil.delete('00000000') + graph_thousand_component,
            pynutil.delete('000000000') + graph_hundred_component,
            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
        )
        graph_hundred_billions = hundred_billions @ graph_hundred_billions_component

        # 14 digits: two leading digits + 조.
        trillion = NEMO_DIGIT**14
        graph_trillion_component = (
            (NEMO_DIGIT**2 @ graph_1_to_99)
            + pynutil.insert('조')
            + pynini.union(
                pynini.closure(pynutil.delete('0')),
                graph_ten_billions_component,
                pynutil.delete('0') + graph_billions_component,
                pynutil.delete('00') + graph_thousand_million_component,
                pynutil.delete('000') + graph_hundred_million_component,
                pynutil.delete('0000') + graph_ten_million_component,
                pynutil.delete('00000') + graph_million_component,
                pynutil.delete('000000') + graph_hundred_thousand_component,
                pynutil.delete('0000000') + graph_ten_thousand_component,
                pynutil.delete('00000000') + graph_thousand_component,
                pynutil.delete('000000000') + graph_hundred_component,
                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
            )
        )
        graph_trillions = trillion @ graph_trillion_component

        # 15 digits: three leading digits (hundreds group) + 조.
        ten_trillions = NEMO_DIGIT**15
        graph_ten_trillions_component = (
            (graph_hundred)
            + pynutil.insert('조')
            + pynini.union(
                pynini.closure(pynutil.delete('0')),
                graph_ten_billions_component,
                pynutil.delete('0') + graph_billions_component,
                pynutil.delete('00') + graph_thousand_million_component,
                pynutil.delete('000') + graph_hundred_million_component,
                pynutil.delete('0000') + graph_ten_million_component,
                pynutil.delete('00000') + graph_million_component,
                pynutil.delete('000000') + graph_hundred_thousand_component,
                pynutil.delete('0000000') + graph_ten_thousand_component,
                pynutil.delete('00000000') + graph_thousand_component,
                pynutil.delete('000000000') + graph_hundred_component,
                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
            )
        )
        graph_ten_trillions = ten_trillions @ graph_ten_trillions_component

        # 16 digits: four leading digits (thousands group) + 조.
        hundred_trillions = NEMO_DIGIT**16
        graph_hundred_trillions_component = (
            (graph_thousand)
            + pynutil.insert('조')
            + pynini.union(
                pynini.closure(pynutil.delete('0')),
                graph_ten_billions_component,
                pynutil.delete('0') + graph_billions_component,
                pynutil.delete('00') + graph_thousand_million_component,
                pynutil.delete('000') + graph_hundred_million_component,
                pynutil.delete('0000') + graph_ten_million_component,
                pynutil.delete('00000') + graph_million_component,
                pynutil.delete('000000') + graph_hundred_thousand_component,
                pynutil.delete('0000000') + graph_ten_thousand_component,
                pynutil.delete('00000000') + graph_thousand_component,
                pynutil.delete('000000000') + graph_hundred_component,
                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
            )
        )
        graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component

        # 17 digits: leading digit + 경 (10^16); tail union covers all 조-and-below widths.
        thousand_trillions = NEMO_DIGIT**17
        graph_thousand_trillions_component = (
            graph_digit
            + pynutil.insert('경')
            + pynini.union(
                pynini.closure(pynutil.delete('0')),
                graph_hundred_trillions_component,
                pynutil.delete('0') + graph_ten_trillions_component,
                pynutil.delete('00') + graph_trillion_component,
                pynutil.delete('000') + graph_hundred_billions_component,
                pynutil.delete('0000') + graph_ten_billions_component,
                pynutil.delete('00000') + graph_billions_component,
                pynutil.delete('000000') + graph_thousand_million_component,
                pynutil.delete('0000000') + graph_hundred_million_component,
                pynutil.delete('00000000') + graph_ten_million_component,
                pynutil.delete('000000000') + graph_million_component,
                pynutil.delete('0000000000') + graph_hundred_thousand_component,
                pynutil.delete('00000000000') + graph_ten_thousand_component,
                pynutil.delete('000000000000') + graph_thousand_component,
                pynutil.delete('0000000000000') + graph_hundred_component,
                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
            )
        )
        graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component

        # FST: union of every supported width (1 to 17 digits, plus zero).
        graph_num = pynini.union(
            graph_thousand_trillions,
            graph_hundred_trillions,
            graph_ten_trillions,
            graph_trillions,
            graph_hundred_billions,
            graph_ten_billions,
            graph_billions,
            graph_thousand_million,
            graph_hundred_million,
            graph_ten_million,
            graph_million,
            graph_hundred_thousand,
            graph_ten_thousand,
            graph_thousand,
            graph_hundred,
            graph_1_to_99,
            graph_zero,
        ).optimize()

        # ----------------------------
        # Native counting + counters
        # e.g., 3개, 2명, 10살
        #
        # In Korean, counters require native numeral forms
        # for small numbers (한/두/세…, 열/스무/서른…).
        counter_suffix = pynini.string_file(get_abs_path("data/number/counter_suffix.tsv"))
        # Counters are passed through verbatim (input projection), not translated.
        counter_suffix_accep = pynini.project(counter_suffix, "input").optimize()

        native_ones = pynini.string_file(get_abs_path("data/number/native_ones.tsv"))  # 1~9: 한/두/세/...
        ordinal_tens = pynini.string_file(get_abs_path("data/ordinal/tens.tsv"))  # 10=열, 20=스무, 30=서른
        ordinal_tens_prefix = pynini.string_file(get_abs_path("data/ordinal/tens_prefix.tsv"))  # 열/스물/서른

        # 11-39 = tens prefix + native ones; native forms only go up to 39 here.
        native_11_to_39 = (ordinal_tens_prefix + native_ones).optimize()
        native_1_to_39 = pynini.union(native_ones, ordinal_tens, native_11_to_39).optimize()

        # Compose number + counter as one cardinal token
        counter_case = (
            pynutil.insert('integer: "')
            + native_1_to_39
            + pynutil.insert('" ')
            + pynutil.insert('counter: "')
            + counter_suffix_accep
            + pynutil.insert('"')
        ).optimize()

        # Sign and final formatting
        # Build the integer token (integer: "...")
        integer_token = pynutil.insert('integer: "') + graph_num + pynutil.insert('"')

        # Sign handling:
        # - minus sets negative flag
        # - plus is ignored (positive number)
        minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
        plus_prefix = pynutil.delete("+")

        # Accounting negative: "( 1,234 )" -> negative + integer:"1234"
        paren_negative = (
            pynutil.insert('negative: "true" ') + pynutil.delete("(") + ws + integer_token + ws + pynutil.delete(")")
        )

        # Signed number: optional (+|-) + integer
        signed_integer = (minus_prefix | plus_prefix).ques + integer_token

        # Prefer accounting-form first, then signed form
        # NOTE(review): the union carries no explicit weights, so "prefer" relies
        # on downstream shortest-path tie-breaking — confirm intended priority.
        final_graph = paren_negative | signed_integer | counter_case

        # Wrap with class tokens and finalize
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
        # Raw number graph (no token wrapper) for reuse by date/decimal/etc.
        self.graph = graph_num
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class DateFst(GraphFst):
    """
    Finite state transducer for classifying dates in Korean, e.g.
        2024/01/30 -> date { year: "이천이십사" month: "일월" day: "삼십" }
        2024/1/30 -> date { year: "이천이십사" month: "일월" day: "삼십" }
        2024-01-30 -> date { year: "이천이십사" month: "일월" day: "삼십" }
        2024.01.30 -> date { year: "이천이십사" month: "일월" day: "삼십" }

        기원전233년 -> date { era: "기원전" year: "이백삼십삼년" }
        기원후2024년 -> date { era: "기원후" year: "이천이십사년" }

        21일월요일 -> tokens { date { day: "이십일일" weekday: "월요일" } }
        1970년대 -> date { year: "천구백칠십년대" }

        1월1일(월)~3일(수)
            -> tokens { date { month: "일월" day: "일일" weekday: "월요일" } }
               tokens { name: "부터" }
               tokens { date { day: "삼일" weekday: "수요일" } }

        1970~1980년대
            -> tokens { cardinal { integer: "천구백칠십" } }
               tokens { name: "부터" }
               tokens { date { year: "천구백팔십년대" } }

        7월5~9일(월~금)
            -> tokens { date { month: "칠월" } }
               tokens { cardinal { integer: "오" } }
               tokens { name: "부터" }
               tokens { date { day: "구일" weekday: "월요일" } }
               tokens { name: "부터" }
               tokens { date { weekday: "금요일" } }

        2023년3월1일(수)~6월12일(화)
            -> tokens { date { year: "이천이십삼년" month: "삼월" day: "일일" weekday: "수요일" } }
               tokens { name: "부터" }
               tokens { date { month: "유월" day: "십이일" weekday: "화요일" } }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # Strip at most one leading zero before reading a number (e.g. "05" -> 5).
        strip0 = pynini.closure(pynutil.delete("0"), 0, 1)
        graph_cardinal = cardinal.graph
        cardinal_lz = (strip0 + graph_cardinal).optimize()

        # Load base .tsv files
        week = pynini.string_file(get_abs_path("data/date/week.tsv"))
        # Irregular month readings (e.g. 6월/10월) are listed in exceptions.tsv.
        month_exceptions = pynini.string_file(get_abs_path("data/date/exceptions.tsv"))
        month_exceptions_inputs = pynini.project(month_exceptions, "input").optimize()

        # Non-exception inputs go through the generic cardinal path
        graph_cardinal_non_exceptions = pynini.compose(
            pynini.difference(pynini.project(graph_cardinal, "input"), month_exceptions_inputs).optimize(),
            graph_cardinal,
        ).optimize()

        # Month cardinal: prefer exceptions;
        month_cardinal = strip0 + (month_exceptions | graph_cardinal_non_exceptions).optimize()

        era = pynini.union("기원전", "기원후").optimize()
        # Date separators are deleted; components are re-joined with spaces.
        signs = pynutil.delete("/") | pynutil.delete(".") | pynutil.delete("-")

        # Strict digit ranges for M/D/Y and Y/M/D
        _d = pynini.union(*[pynini.accep(str(i)) for i in range(10)])
        _1to9 = pynini.union(*[pynini.accep(str(i)) for i in range(1, 10)])

        # For standalone years:
        # - No era: 1-4 digits with NO leading zeros
        YEAR_NO_ERA_1TO4 = pynini.closure(pynutil.delete("0"), 0, 3) + _1to9 + pynini.closure(_d, 0, 3)
        # - With era (기원전/기원후): allow leading zeros but strip them
        YEAR_ERA_1TO4 = pynini.closure(pynutil.delete("0"), 0, 3) + _1to9 + pynini.closure(_d, 0, 3)

        # MM: 01-09 | 10-12
        MM = (pynini.accep("0") + _1to9) | (pynini.accep("1") + pynini.union("0", "1", "2"))

        # DD: 01-09 | 10-19 | 20-29 | 30-31
        DD = (
            (pynini.accep("0") + _1to9)
            | (pynini.accep("1") + _d)
            | (pynini.accep("2") + _d)
            | (pynini.accep("3") + pynini.union("0", "1"))
        )

        # YYYY: exactly 4 digits and two-digit year for M/D/YY and D/M/YY
        YYYY = pynini.union("1", "2") + _d + _d + _d
        YY = _d + _d

        # Map digits -> cardinal words using existing graphs (strip leading zero via month_cardinal/cardinal_lz)
        mm_to_text = pynini.compose(MM, month_cardinal).optimize()
        dd_to_text = pynini.compose(DD, cardinal_lz).optimize()
        yy_to_text = pynini.compose(YY, graph_cardinal).optimize()

        # Components with tags/suffixes (strict)
        month_component_md = (
            pynutil.insert("month: \"") + mm_to_text + pynutil.insert("월") + pynutil.insert("\"")
        ).optimize()
        day_component_md = (
            pynutil.insert("day: \"") + dd_to_text + pynutil.insert("일") + pynutil.insert("\"")
        ).optimize()
        year_component_y2 = (
            pynutil.insert("year: \"") + yy_to_text + pynutil.insert("년") + pynutil.insert("\"")
        ).optimize()

        # Generic components
        era_component = pynutil.insert("era: \"") + era + pynutil.insert("\"")

        # Brackets for weekday.
        # FIX: each bracket pattern was previously a union of two byte-identical
        # alternatives (copy-paste duplication); A | A == A, so keep one.
        # NOTE(review): pynutil.delete(delete_space) wraps a transducer that is
        # already a deleter — confirm pynutil.delete(" ") closure wasn't intended.
        front_bracket = (
            pynini.closure(pynutil.delete(delete_space))
            + pynutil.delete("(")
            + pynini.closure(pynutil.delete(delete_space))
        )
        preceding_bracket = (
            pynini.closure(pynutil.delete(delete_space))
            + pynutil.delete(")")
            + pynini.closure(pynutil.delete(delete_space))
        )

        # Bracketed weekday, optionally a range or a pair:
        # (월), (월〜금) -> "…부터…", (월・화) -> concatenated.
        # NOTE(review): "〜" (wave dash) and "・" (middle dot) look inherited from
        # the ja grammar — confirm Korean text actually uses these characters.
        week_component_bracketed = (
            (front_bracket + pynutil.insert("weekday: \"") + week + preceding_bracket + pynutil.insert("\""))
            | (
                front_bracket
                + pynutil.insert("weekday: \"")
                + week
                + pynini.cross("〜", "부터")
                + week
                + preceding_bracket
                + pynutil.insert("\"")
            )
            | (
                front_bracket
                + pynutil.insert("weekday: \"")
                + week
                + pynutil.delete("・")
                + week
                + preceding_bracket
                + pynutil.insert("\"")
            )
        )

        week_component_plain = pynutil.insert("weekday: \"") + week + pynutil.insert("\"")
        week_component = week_component_bracketed | week_component_plain

        # Strict 4-digit year component (1000-2999)
        year_component_y4_strict = (
            pynutil.insert("year: \"") + (YYYY @ graph_cardinal) + pynutil.insert("년") + pynutil.insert("\"")
        ).optimize()

        # Prefer strict 4-digit; still allow 2-digit with worse weight (for MM/DD/YY etc.)
        year_component_md_strict = (year_component_y4_strict | pynutil.add_weight(year_component_y2, 1.0)).optimize()

        # Format: YYYY/MM/DD(weekday)
        graph_basic_date = (
            pynini.closure(era_component + insert_space, 0, 1)
            + year_component_y4_strict
            + signs
            + insert_space
            + (pynutil.insert("month: \"") + month_cardinal + pynutil.insert("월") + pynutil.insert("\""))
            + signs
            + insert_space
            + (pynutil.insert("day: \"") + cardinal_lz + pynutil.insert("일") + pynutil.insert("\""))
            + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1)
        )

        # American: MM/DD/YYYY
        graph_american_date = (
            month_component_md
            + signs
            + insert_space
            + day_component_md
            + signs
            + insert_space
            + year_component_md_strict
            + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1)
        ).optimize()

        # European: DD/MM/YYYY
        graph_european_date = (
            day_component_md
            + signs
            + insert_space
            + month_component_md
            + signs
            + insert_space
            + year_component_md_strict
            + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1)
        ).optimize()

        # Single elements (year/month/day)
        individual_year_component = (
            # with era: (기원전|기원후) + 1~4 digits (leading zeros allowed -> stripped)
            (
                era_component
                + insert_space
                + pynutil.insert("year: \"")
                + (YEAR_ERA_1TO4 @ graph_cardinal)
                + pynutil.delete("년")
                + pynutil.insert("년")
                + pynutil.insert("\"")
            )
            |
            # no era: 1~4 digits, no leading zero
            (
                pynutil.insert("year: \"")
                + (YEAR_NO_ERA_1TO4 @ graph_cardinal)
                + pynutil.delete("년")
                + pynutil.insert("년")
                + pynutil.insert("\"")
            )
        ).optimize()

        # "N월" -> month token; suffix is consumed from input and re-inserted into the value.
        individual_month_component = (
            pynutil.insert("month: \"")
            + month_cardinal
            + pynutil.delete("월")
            + pynutil.insert("월")
            + pynutil.insert("\"")
        )

        # "N일" -> day token.
        individual_day_component = (
            pynutil.insert("day: \"")
            + cardinal_lz
            + pynutil.delete("일")
            + pynutil.insert("일")
            + pynutil.insert("\"")
        )

        # Fully spelled-out weekday words (e.g. 월요일) pass through unchanged.
        week_full_word_acceptor = pynini.project(week, "output")
        week_component_full_word = pynutil.insert("weekday: \"") + week_full_word_acceptor + pynutil.insert("\"")

        day_and_weekday_component = (
            individual_day_component + pynini.closure(insert_space, 0, 1) + week_component_full_word
        )

        month_and_weekday_component = (
            individual_month_component + pynini.closure(insert_space, 0, 1) + week_component_full_word
        )

        # Any single date element, optionally followed by a (bracketed) weekday.
        graph_individual_component = (
            day_and_weekday_component
            | month_and_weekday_component
            | individual_year_component
            | individual_month_component
            | individual_day_component
            | week_component
        ) + pynini.closure(insert_space + week_component, 0, 1)

        # Adjacent element pairs/triples: Y+M, M+D, Y+M+D.
        graph_individual_component_combined = (
            (individual_year_component + insert_space + individual_month_component)
            | (individual_month_component + insert_space + individual_day_component)
            | (
                individual_year_component
                + insert_space
                + individual_month_component
                + insert_space
                + individual_day_component
            )
        ) + pynini.closure(insert_space + week_component, 0, 1)

        # Decade suffix: "1970년대" (optionally with an era prefix).
        nendai = pynini.accep("년대")
        era_nendai = (
            pynini.closure(era_component + insert_space, 0, 1)
            + pynutil.insert("year: \"")
            + graph_cardinal
            + nendai
            + pynutil.insert("\"")
        ).optimize()

        graph_all_date = (
            graph_basic_date
            | graph_american_date
            | graph_european_date
            | graph_individual_component
            | graph_individual_component_combined
            | era_nendai
        ).optimize()

        final_graph = self.add_tokens(graph_all_date)
        self.fst = final_graph.optimize()
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal numbers in Korean, e.g.
        1.23 -> decimal { integer_part: "일" fractional_part: "이삼" }
        -0.5 -> decimal { negative: "마이너스" integer_part: "영" fractional_part: "오" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        # Integer part: special-case "10000" -> 만, otherwise fall back to the
        # full cardinal graph.
        integer_reading = (pynini.cross("10000", "만") | cardinal.graph).optimize()

        # The fractional part is read digit by digit; zero has its own reading.
        digit_reading = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        zero_reading = pynini.string_file(get_abs_path("data/number/zero.tsv"))
        fraction_digits = pynini.closure(digit_reading | zero_reading, 1)

        # Tagged fields for the token serialization.
        integer_field = pynutil.insert('integer_part: "') + integer_reading + pynutil.insert('"')
        fractional_field = pynutil.insert('fractional_part: "') + fraction_digits + pynutil.insert('"')

        # Unsigned decimal: <integer> '.' <digits>; the dot is dropped and the
        # two fields are separated by a space.
        unsigned_decimal = integer_field + pynutil.delete('.') + pynutil.insert(NEMO_SPACE) + fractional_field

        # Sign: either a literal "-" or an already-spoken 마이너스 prefix; both
        # are emitted as 마이너스 in the negative field.
        sign_field = (
            pynutil.insert('negative: "')
            + (pynini.cross("-", "마이너스") | pynini.accep("마이너스"))
            + pynutil.insert('"')
        )

        signed_or_unsigned = unsigned_decimal | (sign_field + pynutil.insert(NEMO_SPACE) + unsigned_decimal)

        # Unwrapped decimal graph, reusable by other grammars (e.g. measure/money).
        self.just_decimal = unsigned_decimal.optimize()

        # Final graph wrapped in the decimal class token.
        self.fst = self.add_tokens(signed_or_unsigned).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import (
    NEMO_DIGIT,
    NEMO_NOT_SPACE,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class ElectronicFst(GraphFst):
    """
    Finite state transducer (FST) for classifying electronic expressions such as
    email addresses, URLs, domain names, file names, and credit-card numbers in Korean.

    Example conversions:
        abc@nvidia.co.kr    -> electronic { username: "abc" domain: "nvidia.co.kr" }
        www.nvidia.com      -> electronic { domain: "www.nvidia.com" }
        https://nvidia.com  -> electronic { protocol: "HTTPS colon slash slash" domain: "nvidia.com" }
        1234-5678-9012-3456 -> electronic { protocol: "신용카드" domain: "1234 5678 9012 3456" }

    Args:
        cardinal: FST used for digit verbalization in non-deterministic mode.
        deterministic: if True, provide a single transduction path; otherwise allow several.
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        # --- basic character classes and punctuation acceptors ---
        lower = pynini.union(*[pynini.accep(c) for c in "abcdefghijklmnopqrstuvwxyz"])
        upper = pynini.union(*[pynini.accep(c) for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])
        ascii_letter = (lower | upper).optimize()
        ascii_alnum = (ascii_letter | NEMO_DIGIT).optimize()

        hyphen = pynini.accep("-")
        dot = pynini.accep(".")
        slash = pynini.accep("/")
        at_sign = pynini.accep("@")

        # In non-deterministic mode digits may also be read as full cardinals.
        numbers = (
            NEMO_DIGIT
            if deterministic
            else (pynutil.insert(NEMO_SPACE) + cardinal.long_numbers + pynutil.insert(NEMO_SPACE))
        )

        # --- lexical resources ---
        cc_cues = pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv"))
        accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

        # --- username: starts alphanumeric, may contain symbols except '@' ---
        username_symbols = pynini.difference(accepted_symbols, at_sign)
        username_core = ascii_alnum + pynini.closure(ascii_alnum | numbers | username_symbols)
        username = pynutil.insert('username: "') + username_core + pynutil.insert('"') + pynini.cross("@", NEMO_SPACE)

        # --- domain: labels of [A-Za-z0-9-]+ joined by '.'<alnum>{2,} TLDs, optional /path ---
        label = pynini.closure(ascii_alnum | hyphen, 1)
        tld = dot + pynini.closure(ascii_alnum, 2)
        # Either label(s) followed by TLD(s), or a bare TLD such as ".com".
        domain_core = (label + pynini.closure(tld, 1)) | tld

        path = slash + pynini.closure(NEMO_NOT_SPACE, 1)  # "/..." with at least one non-space char
        domain_with_opt_path = domain_core + pynini.closure(path, 0, 1)
        domain_tagged = pynutil.insert('domain: "') + domain_with_opt_path.optimize() + pynutil.insert('"')

        # --- protocol: http/https, file, and the www prefix ---
        protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(NEMO_SPACE))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
            pynini.accep("://") @ protocol_symbols
        )
        protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)
        protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000)
        protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)
        protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')

        # Start from the empty language and union each accepted pattern in.
        graph = pynini.Fst()

        # (1) e-mail: guard requires an '@' with a later '.' before matching username@domain.
        email_guard = NEMO_SIGMA + at_sign + NEMO_SIGMA + dot + NEMO_SIGMA
        graph |= pynini.compose(email_guard, username + domain_tagged)

        # (2) bare domain, without a protocol
        graph |= (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize()

        # File name with a known extension; the stem contains no '/' or '.'.
        known_extensions = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/extensions.tsv")),
            "input",
        )
        filename_stem = pynini.closure(
            pynini.difference(NEMO_NOT_SPACE, pynini.union(slash, dot)),
            1,
        )
        graph |= (pynutil.insert('domain: "') + filename_stem + known_extensions + pynutil.insert('"')).optimize()

        # (3) URL with an explicit protocol
        graph |= protocol + insert_space + domain_tagged

        # (4) credit-card patterns (deterministic mode only)
        if deterministic:
            # Cue word followed by 4-16 digits.
            cc_digits = pynini.closure(NEMO_DIGIT, 4, 16)
            cc_phrases = (
                pynutil.insert('protocol: "')
                + cc_cues
                + pynutil.insert('" domain: "')
                + delete_space
                + cc_digits
                + pynutil.insert('"')
            )
            graph |= cc_phrases

            # 16 digits grouped 4-4-4-4, separated by '-' or a space.
            four_digits = pynini.closure(NEMO_DIGIT, 4, 4)
            group_sep = pynini.union(hyphen, NEMO_SPACE)
            cc16_grouped = (four_digits + pynini.cross(group_sep, " ")) ** 3 + four_digits
            cc16_grouped = cc16_grouped + delete_space

            cc16_no_cue = (
                pynutil.insert('protocol: "신용카드 " ')
                + pynutil.insert('domain: "')
                + cc16_grouped
                + pynutil.insert('"')
            )
            # Strong negative weight so this outranks the Date FST on the same input.
            graph |= pynutil.add_weight(cc16_no_cue.optimize(), -1.0)

        self.fst = self.add_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class FractionFst(GraphFst):
    """
    Finite state transducer for classifying Korean fractions, e.g.
        3/5        -> tokens { fraction { numerator: "삼" denominator: "오" } }
        2과7/9     -> tokens { fraction { integer_part: "이" numerator: "칠" denominator: "구" } }
        마이너스3/5 -> tokens { fraction { negative: "마이너스" numerator: "삼" denominator: "오" } }

    Args:
        cardinal: CardinalFst whose ``graph`` verbalizes plain integers.
        deterministic: if True, provide a single transduction path.
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)

        cardinal = cardinal.graph
        digit_reading = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        zero_reading = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        delete_slash = pynutil.delete('/')
        root_sign = pynini.accep('√')

        # Decimal numeral, e.g. 1.23 -> 일점이삼 (digits after the point read one by one).
        decimal_numeral = cardinal + pynini.cross(".", "점") + pynini.closure(digit_reading | zero_reading)

        # A numeral is a cardinal or a decimal, optionally preceded by a root sign.
        numeral = cardinal | (root_sign + cardinal) | decimal_numeral | (root_sign + decimal_numeral)

        # Whole-number prefix such as "2과" / "1와" (the particle is kept in the value).
        integer_field = (
            pynutil.insert('integer_part: "')
            + numeral
            + (pynini.accep("과") | pynini.accep("와"))
            + pynutil.insert('"')
        )
        integer_field_spaced = integer_field + delete_space + pynutil.insert(NEMO_SPACE)

        denominator_field = pynutil.insert('denominator: "') + numeral + pynutil.insert('"')
        numerator_field = pynutil.insert('numerator: "') + numeral + pynutil.insert('"')

        # Format 1: slash notation, e.g. "3/4".
        fraction_slash = (
            pynini.closure(integer_field_spaced, 0, 1)
            + numerator_field
            + delete_slash
            + pynutil.insert(NEMO_SPACE)
            + denominator_field
            + pynutil.insert(NEMO_SPACE)
            + pynutil.insert('morphosyntactic_features: "분의"')
        )

        # Format 2: native Korean notation, e.g. "4분의3" (denominator comes first).
        fraction_word = (
            pynini.closure(integer_field_spaced, 0, 1)
            + denominator_field
            + pynutil.delete("분의")
            + pynutil.insert(NEMO_SPACE)
            + pynutil.insert('morphosyntactic_features: "분의"')
            + pynutil.insert(NEMO_SPACE)
            + numerator_field
        )

        # Optional minus sign, written "-" or spelled out as "마이너스".
        negative_field = (
            pynutil.insert('negative: "')
            + (pynini.accep("마이너스") | pynini.cross("-", "마이너스"))
            + pynutil.insert('"')
            + pynutil.insert(NEMO_SPACE)
        )

        graph = pynini.closure(negative_field, 0, 1) + (fraction_slash | fraction_word)
        self.graph = graph.optimize()
        self.fst = self.add_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying Korean measure expressions.
        1kg    -> measure { cardinal { integer: "일" } units: "킬로그램" }
        12.5km -> measure { decimal { integer_part: "십이" fractional_part: "오" } units: "킬로미터" }
        2/3m   -> measure { fraction { numerator: "이" denominator: "삼" } units: "미터" }
        60km/h -> measure { cardinal { integer: "육십" } units: "킬로미터 퍼 시간" }

    Attaches measurement units (e.g. "킬로미터", "그램") to numeric expressions
    classified by the cardinal, decimal, or fraction subgraphs.

    Args:
        cardinal: FST handling integer (cardinal) numbers.
        decimal: FST handling decimal numbers (optional).
        fraction: FST handling fractional numbers (optional).
        deterministic: if True, provide a single transduction path; otherwise allow several.
    """

    def __init__(
        self,
        cardinal: GraphFst,
        decimal: GraphFst = None,
        fraction: GraphFst = None,
        deterministic: bool = True,
    ):
        super().__init__(name="measure", kind="classify", deterministic=deterministic)

        graph_cardinal = cardinal.graph
        unit_lexicon = pynini.string_file(get_abs_path("data/measure/unit.tsv"))

        # "per" expressions such as km/h, m/s: '/' is verbalized as "퍼".
        maybe_space = pynini.closure(delete_space, 0, 1)
        per_unit = pynini.cross("/", "퍼") + maybe_space + insert_space + unit_lexicon
        optional_per = pynini.closure(maybe_space + insert_space + per_unit, 0, 1)

        # units field: either "<unit>[ 퍼 <unit>]" or a bare per-expression.
        units_field = pynutil.insert('units: "') + (unit_lexicon + optional_per | per_unit) + pynutil.insert('"')

        # Optional minus: emit the field, then swallow the sign symbol or spelled-out word.
        minus_field = pynutil.insert('negative: "마이너스" ')
        swallow_minus = pynini.cross("-", "") | pynini.cross("마이너스", "")
        optional_minus = pynini.closure(minus_field + swallow_minus + maybe_space, 0, 1)

        variants = []

        # 1) cardinal + unit, e.g. "12kg"
        cardinal_measure = (
            pynutil.insert("cardinal { ")
            + pynutil.insert('integer: "')
            + graph_cardinal
            + delete_space
            + pynutil.insert('" } ')
            + units_field
        )
        variants.append(cardinal_measure)

        # 2) decimal + unit, e.g. "12.5km" (reuses the unwrapped decimal subgraph)
        if decimal is not None:
            decimal_measure = (
                pynutil.insert("decimal { ")
                + optional_minus
                + decimal.just_decimal
                + delete_space
                + pynutil.insert(" } ")
                + units_field
            )
            variants.append(decimal_measure)

        # 3) fraction + unit, e.g. "2/3m" or "삼분의 이 미터"
        if fraction is not None:
            fraction_measure = (
                pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + units_field
            )
            variants.append(fraction_measure)

        # Union all supported numeric forms (cardinal | decimal | fraction).
        graph = variants[0]
        for variant in variants[1:]:
            graph |= variant

        # Final wrapping into tokens { measure { ... } }.
        self.fst = self.add_tokens(graph).optimize()
+ """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="money", kind="classify", deterministic=deterministic) + + graph_cardinal = cardinal.graph + sp = pynini.closure(delete_space) # absorb any amount of spaces in input + + # --- Numbers (integer / optional minor) --- + # Integer part: "0" or a non-zero leading digit; allow commas (e.g., 18,925,000) + integer_part_fst = pynini.union("0", (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT | pynutil.delete(","))) + + # Plain integer → integer_part: "" + graph_integer_plain = ( + pynutil.insert('integer_part: "') + (integer_part_fst @ graph_cardinal) + pynutil.insert('" ') + ) + + # Optional 2-digit decimal (kept as minor_part if ever used downstream) + decimal_part_fst = NEMO_DIGIT**2 + graph_minor = pynutil.insert('minor_part: "') + (decimal_part_fst @ graph_cardinal) + pynutil.insert('" ') + + # Integer with scale suffix (만/억/조) → wrap the whole thing in one integer_part + scale_unit = pynini.union("만", "억", "조") + value_with_scale = (integer_part_fst @ graph_cardinal) + scale_unit + graph_integer_with_suffix = ( + pynutil.insert('integer_part: "') + value_with_scale + pynutil.insert('" ') + ).optimize() + + # Integer (+ optional ".<2-digit>" minor) + number_component_plain = graph_integer_plain + pynini.closure(pynutil.delete(".") + graph_minor, 0, 1) + number_component = (graph_integer_with_suffix | number_component_plain).optimize() + + # --- Currency (prefix or suffix) --- + # currency_major.tsv example: + # ₩ 원 + # KRW 원 + # 원 원 + maj_labels = load_labels(get_abs_path("data/money/currency_major.tsv")) + + # Prefix currency (e.g., ₩, KRW): emit currency_maj then number + currency_major_prepended = pynini.union( + *[pynutil.delete(surface) + pynutil.insert(f'currency_maj: "{unit}" ') for surface, unit in maj_labels] + ).optimize() + + # Suffix currency (e.g., ...원, ...달러): convert unit literal to currency_maj + currency_major_appended = pynini.union( + 
*[pynutil.delete(unit) + pynutil.insert(f'currency_maj: "{unit}" ') for _, unit in maj_labels] + ).optimize() + + # --- Compose (NO period handling) --- + # NOTE: We deliberately do NOT consume '/월', '/년', '/주', '/일', '/시간' here. + # If present in the raw text, they remain outside the money token and can be handled upstream/elsewhere. + + # [currency] [number] + graph_prepend = (currency_major_prepended + sp + number_component).optimize() + + # [number] [currency] + graph_append = (number_component + currency_major_appended).optimize() + + graph = (graph_prepend | graph_append).optimize() + + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py new file mode 100644 index 000000000..59fa30ada --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying Korean ordinal expressions, e.g. 
+ 1번째 -> ordinal { integer: "첫번째" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + + # Load base .tsv files + graph_digit = pynini.string_file(get_abs_path("data/ordinal/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv")) + graph_tens = pynini.string_file(get_abs_path("data/ordinal/tens.tsv")) + graph_tens_prefix = pynini.string_file(get_abs_path("data/ordinal/tens_prefix.tsv")) + + graph_11_to_39 = (graph_tens_prefix + graph_digit).optimize() + + # Combine all ordinal forms from 1 to 39 + graph_ordinal_1to39 = ( + graph_exceptions | graph_digit | graph_zero | graph_tens | graph_11_to_39 + ).optimize() + pynini.accep("번째") + + # Accept tens digit 4–9 + tens_digit_4_to_9_accep = pynini.union(*[pynini.accep(str(i)) for i in range(4, 10)]) + # Accept any single digit + any_single_digit_accep = pynini.union(*[pynini.accep(str(i)) for i in range(0, 10)]) + # Combine two digits + from_40_to_99_inputs = tens_digit_4_to_9_accep + any_single_digit_accep + + # Match numbers with 3 or more digits + input_100_plus = pynini.closure(any_single_digit_accep, 3) + + # Combine both ranges (40–99 and 100+): total range = 40 and above + filter_inputs_from_40 = (from_40_to_99_inputs | input_100_plus).optimize() + + # Only allow cardinal numbers that are 40 or more + graph_cardinal_from40_filtered = pynini.compose(filter_inputs_from_40, cardinal.graph) + + # Add "번째" to the filtered cardinal graph. + graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째") + + graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+ + + # Single-character particles (가, 이, 은, 는, 로, 도 ...) 
+ josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다") + + # Multi-character particles (부터, 까지) + josa_multi = pynini.union("부터", "까지") + + # Allow patterns like: + # 번째 + (optional single-josa) + (optional multi-josa) + josa = (josa_single.ques + josa_multi.ques).optimize() + + # Final ordinal graph with optional particles + graph_ordinal_with_josa = (graph_ordinal + josa).optimize() + + # Build the “integer: …” token structure + final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"') + + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/punctuation.py b/nemo_text_processing/text_normalization/ko/taggers/punctuation.py new file mode 100644 index 000000000..a10250a99 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/punctuation.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="punctuation", kind="classify", deterministic=deterministic) + + range_component = pynini.cross("〜", "부터") | pynini.accep("부터") + + graph = pynutil.insert('name: "') + range_component + pynutil.insert('"') + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py new file mode 100644 index 000000000..90f31bb1f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#     http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying Korean telephone numbers. 
+ + Example inputs → tokens: + +82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" } + +1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" } + (031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" } + 010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" } + 010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" } + + Args: + deterministic (bool, optional): If True, provide a single transduction; + if False, allow multiple transductions. + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="classify", deterministic=deterministic) + # Separator between digit blocks (e.g., "-" or ".") + delete_sep = pynutil.delete("-") | pynutil.delete(".") + # Optional space inserted between blocks + insert_block_space = insert_space + + # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) + digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() + zero_map = pynini.cross("0", "영") + digit_ko = (digit | zero_map).optimize() + + three_digits = digit_ko**3 + four_digits = digit_ko**4 + + # country code: "+1", "+82", "+1-" + cc_digits = pynini.closure(digit_ko, 1, 3) + + country_code = ( + pynutil.delete("+") + + pynutil.insert('country_code: "') + + cc_digits + + pynutil.insert('"') + + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) + + delete_space + ) + + # area part: "123-" | "123." | "(123)" [space?] or "(123)-" + area_core = three_digits + area_part = ( + (area_core + delete_sep) + | ( + pynutil.delete("(") + + area_core + + pynutil.delete(")") + + pynini.closure(pynutil.delete(" "), 0, 1) + + pynini.closure(delete_sep, 0, 1) + ) + ) + insert_block_space + + # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) + mid = pynini.union(three_digits, four_digits) + last4 = four_digits + + # consume '-' or '.' 
between middle and last blocks + number_part_core = area_part + mid + delete_sep + insert_block_space + last4 + number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"') + + # final graph: with or without country code + graph = pynini.union(country_code + insert_space + number_part, number_part).optimize() + + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/time.py b/nemo_text_processing/text_normalization/ko/taggers/time.py new file mode 100644 index 000000000..b8a499823 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/time.py @@ -0,0 +1,192 @@ +# Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time, e.g. 
+ 오전 10시 30분 -> time { suffix: "오전" hours: "열시" minutes: "삼십분" } + 오후 3시 반 -> time { suffix: "오후" hours: "세시" minutes: "삼십분" } + 자정 -> time { hours: "영시" } + 정오 -> time { hours: "열두시" } + + Args: + cardinal: CardinalFst (Korean cardinal graph) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="time", kind="classify", deterministic=deterministic) + + # Base number-to-words graph from the Cardinal Fst + graph_cardinal = cardinal.graph + strip0 = pynini.closure(pynutil.delete("0"), 0, 1) + + SP = pynini.closure(delete_space) + SEP = SP + insert_space + hour_clock = pynini.string_file(get_abs_path("data/time/hour.tsv")) + division = pynini.string_file(get_abs_path("data/time/division.tsv")) + + division_component = pynutil.insert("suffix: \"") + division + pynutil.insert("\"") + + # --- Special single-word times --- + noon = pynini.accep("정오") + midnight = pynini.accep("자정") + noon_component = pynutil.insert("hours: \"") + pynini.cross(noon, "열두시") + pynutil.insert("\"") + midnight_component = pynutil.insert("hours: \"") + pynini.cross(midnight, "영시") + pynutil.insert("\"") + + # --- Korean Hangul components (H시 [M분|반] [S초]) --- + # "0" or "00" -> 0 + h_zero = strip0 + pynini.accep("0") + # "13".."24" + h_13_24 = pynini.union(*[str(i) for i in range(13, 25)]) + + # "0시" -> "영시" + hour_component_ko_zero = ( + pynutil.insert("hours: \"") + + pynini.cross(h_zero, "영") + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # "13시..24시" -> Sino-Korean words (십삼/…/이십사) + 시 + hour_component_ko_13_24 = ( + pynutil.insert("hours: \"") + + (h_13_24 @ graph_cardinal) + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # "1시..12시" -> Native Korean words (한/두/세/네/…/열두) + 시 + hour_component_ko_1_12 = ( + pynutil.insert("hours: \"") + + (strip0 + hour_clock) + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # Priority: 13-24 > 0 > 1-12 to prevent 
partial matching errors + hour_component_ko = (hour_component_ko_13_24 | hour_component_ko_zero | hour_component_ko_1_12).optimize() + + # Minutes: number+"분" or "반" (approx. 30분). Allows optional '쯤|경' after minutes/반. + about_word = pynini.union("쯤", "경") + minute_number = ( + pynutil.insert("minutes: \"") + + (strip0 + graph_cardinal) + + pynutil.delete("분") + + pynutil.insert("분") + + pynutil.insert("\"") + ) + minute_half = ( + pynutil.insert("minutes: \"") + + pynutil.delete("반") + + pynutil.insert("반") + + pynini.closure(about_word, 0, 1) + + pynutil.insert("\"") + ) + minute_component_ko = (minute_half | minute_number).optimize() + + second_component_ko = ( + pynutil.insert("seconds: \"") + + (strip0 + graph_cardinal) + + pynutil.delete("초") + + pynutil.insert("초") + + pynutil.insert("\"") + ) + + # Allow suffix before or after + suffix_prefix_opt = pynini.closure(division_component + SEP, 0, 1) + suffix_postfix_opt = pynini.closure(SEP + division_component, 0, 1) + + # Hangul patterns + graph_hangul = ( + suffix_prefix_opt + + ( + hour_component_ko + | (hour_component_ko + SEP + minute_component_ko) + | (hour_component_ko + SEP + minute_component_ko + SEP + second_component_ko) + | minute_component_ko + | (minute_component_ko + SEP + second_component_ko) + | second_component_ko + ) + + suffix_postfix_opt + ).optimize() + + # Special words with optional suffix + graph_special = (suffix_prefix_opt + (noon_component | midnight_component) + suffix_postfix_opt).optimize() + + # --- Clock patterns: HH:MM[:SS] --- + colon = pynutil.delete(":") + + # Map 1-12 hours using native-Korean words, allowing an optional leading zero. 
+ graph_hour_1_12 = ( + pynutil.insert("hours: \"") + (strip0 + hour_clock) + pynutil.insert("시") + pynutil.insert("\"") + ).optimize() + + # 0, 00, and 13-24 -> Sino-Korean words + hour_sino_val = ( + pynini.cross("00", "0") + | pynini.cross("0", "0") + | pynini.union(*[pynini.cross(str(i), str(i)) for i in range(13, 25)]) + ) + hour_sino_read = hour_sino_val @ graph_cardinal + + graph_hour_others = pynutil.insert("hours: \"") + hour_sino_read + pynutil.insert("시") + pynutil.insert("\"") + + hour_clock_component = (graph_hour_1_12 | graph_hour_others).optimize() + + minute_clock_component = ( + pynutil.insert("minutes: \"") + strip0 + graph_cardinal + pynutil.insert("분") + pynutil.insert("\"") + ) + second_clock_component = ( + pynutil.insert("seconds: \"") + strip0 + graph_cardinal + pynutil.insert("초") + pynutil.insert("\"") + ) + + # HH:MM (drop minutes if "00") + graph_hm_clock = ( + suffix_prefix_opt + + hour_clock_component + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + minute_clock_component, 0, 1)) + + suffix_postfix_opt + ).optimize() + + # HH:MM:SS (drop minutes/seconds if "00") + graph_hms_clock = ( + suffix_prefix_opt + + hour_clock_component + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + minute_clock_component, 0, 1)) + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + second_clock_component, 0, 1)) + + suffix_postfix_opt + ).optimize() + + graph = (graph_special | graph_hangul | graph_hm_clock | graph_hms_clock).optimize() + graph_final = self.add_tokens(graph) + self.fst = graph_final.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..e2a3a5890 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.ko.taggers.date import DateFst +from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst +from nemo_text_processing.text_normalization.ko.taggers.electronic import ElectronicFst +from nemo_text_processing.text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.ko.taggers.measure import MeasureFst +from nemo_text_processing.text_normalization.ko.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.ko.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.ko.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.ko.taggers.telephone import TelephoneFst +from nemo_text_processing.text_normalization.ko.taggers.time import TimeFst +from nemo_text_processing.text_normalization.ko.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.ko.taggers.word import 
WordFst +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str = "cased", + deterministic: bool = True, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + cardinal = CardinalFst(deterministic=deterministic) + date = DateFst(cardinal=cardinal, deterministic=deterministic) + time = TimeFst(cardinal=cardinal, deterministic=deterministic) + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + word = WordFst(deterministic=deterministic) + decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) + fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) + whitelist = WhiteListFst(deterministic=deterministic) + punctuation = 
PunctuationFst(deterministic=deterministic) + money = MoneyFst(cardinal=cardinal, deterministic=deterministic) + telephone = TelephoneFst(deterministic=deterministic) + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + electronic = ElectronicFst(cardinal=cardinal, deterministic=deterministic) + + classify = pynini.union( + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 1.05), + pynutil.add_weight(word.fst, 100), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(telephone.fst, 1.1), + pynutil.add_weight(electronic.fst, 1.11), + ) + + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + + graph = ( + delete_space + token + pynini.closure((delete_extra_space | pynini.accep("")) + token) + delete_space + ) + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/ko/taggers/whitelist.py b/nemo_text_processing/text_normalization/ko/taggers/whitelist.py new file mode 100644 index 000000000..8977d8c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/whitelist.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class WhiteListFst(GraphFst): + def __init__(self, deterministic: bool = True): + super().__init__(name="whitelist", kind="classify", deterministic=deterministic) + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = pynutil.insert('name: "') + whitelist + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py new file mode 100644 index 000000000..7aa3db709 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/word.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying Korean word. + e.g. 이름 -> tokens { name: "이름" } + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="word", kind="classify", deterministic=deterministic) + + word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT) + + word = pynutil.insert('name: "') + word += pynini.closure(word_char, 1) + word += pynutil.insert('"') + + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/ko/utils.py b/nemo_text_processing/text_normalization/ko/utils.py new file mode 100644 index 000000000..51aaea3e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/utils.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..d2adfa7d9 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal, e.g. 
+ cardinal { negative: "true" integer: "23" } -> 마이너스 이십삼 + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) + + self.optional_sign = pynini.cross("negative: \"true\"", "마이너스 ") + if not deterministic: + self.optional_sign |= pynini.cross("negative: \"true\"", "음수 ") + self.optional_sign |= pynini.cross("negative: \"true\"", "- ") + + self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1) + + # quoted: parses a quoted string value like "십", "명" + quoted = delete_space + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('"') + + # integer: mandatory field + integer = pynutil.delete("integer:") + quoted + + # counter: optional field (e.g., 명/개/살). + counter = pynutil.delete("counter:") + quoted + counter = pynini.closure(delete_space + counter, 0, 1) + self.numbers = self.optional_sign + integer + counter + + delete_tokens = self.delete_tokens(self.numbers) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py new file mode 100644 index 000000000..bfd5e9aa1 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + + +class DateFst(GraphFst): + """ + Korean date verbalizer + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + era_component = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + year_component = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + month_component = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + day_component = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + week_component = ( + pynutil.delete("weekday: \"") + + pynini.closure(delete_space) + + pynini.closure(NEMO_NOT_QUOTE) + + pynini.closure(delete_space) + + pynutil.delete("\"") + ) + + SPACE = pynini.closure(delete_space, 0, 1) + insert_space + + # This graph now correctly uses the 'delete_space' variable defined above. + graph_basic_date = ( + pynini.closure(era_component + SPACE, 0, 1) + + pynini.closure(year_component + SPACE, 0, 1) + + pynini.closure(month_component + SPACE, 0, 1) + + pynini.closure(day_component, 0, 1) + + pynini.closure(SPACE + week_component, 0, 1) + ) | (month_component + SPACE + week_component) + + final_graph = graph_basic_date + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/decimal.py b/nemo_text_processing/text_normalization/ko/verbalizers/decimal.py new file mode 100644 index 000000000..54375c5a9 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/decimal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class DecimalFst(GraphFst): + def __init__(self, deterministic: bool = True): + super().__init__(name="decimal", kind="verbalize", deterministic=deterministic) + + # Extract integer part + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + + # Extract fractional part and prepend "점" + fractional_part = ( + pynutil.delete('fractional_part: "') + + pynutil.insert("점") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + # Verbalize decimal number without sign + decimal_positive = integer_part + pynutil.delete(" ") + fractional_part + + # Handle negative sign + negative_sign = ( + pynutil.delete('negative: "') + pynini.accep("마이너스") + pynutil.delete('"') + pynutil.delete(" ") + ) + + # Combine positive and negative cases + decimal = decimal_positive | (negative_sign + pynutil.insert(" ") + decimal_positive) + + delete_tokens = self.delete_tokens(decimal) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py new file mode 100644 index 000000000..dc3b3f97b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py @@ -0,0 +1,141 @@ +# 
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_ALPHA, + NEMO_CHAR, + NEMO_DIGIT, + NEMO_NOT_QUOTE, + NEMO_SIGMA, + GraphFst, + delete_extra_space, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class ElectronicFst(GraphFst): + """ + Finite state transducer (FST) for verbalizing **electronic expressions** (email/URL/domain). + + Input tokens: + tokens { electronic { username: "abc" domain: "abc.com" } } + + Example output (policy-dependent): + abc 골뱅이 abc 닷컴 + + Args: + deterministic: If True, produce a single verbalization. 
+ """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) + + # 1) Handle digits (0–9) + graph_digit_no_zero = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() + + graph_zero = pynini.cross("0", "영") + if not deterministic: + graph_zero |= pynini.cross("0", "공") + graph_digit = (graph_digit_no_zero | graph_zero).optimize() + + digit_inline_rewrite = pynini.cdrewrite( + graph_digit, + "", + "", + NEMO_SIGMA, + ) + + # 3) username part (add spaces between characters) + raw_username = pynini.closure(NEMO_NOT_QUOTE, 1) + + user_name = ( + pynutil.delete("username:") + + delete_space + + pynutil.delete('"') + + (raw_username @ digit_inline_rewrite) + + pynutil.delete('"') + ) + + # 4) domain part (handle common endings like .com → 닷컴) + domain_common_pairs = ( + pynini.string_file(get_abs_path("data/electronic/domain.tsv")) + | pynini.string_file(get_abs_path("data/electronic/extensions.tsv")) + ).optimize() + + # Rewrite known domains (.com → 닷컴) + tld_rewrite = pynini.cdrewrite( + domain_common_pairs, + "", + "", + NEMO_SIGMA, + ) + # Add a space before “닷” if needed + add_space_before_dot = pynini.cdrewrite( + pynini.cross("닷", " 닷"), + (NEMO_ALPHA | NEMO_DIGIT | NEMO_CHAR), + "", + NEMO_SIGMA, + ) + + raw_domain = pynini.closure(NEMO_NOT_QUOTE, 1) + + four = pynini.closure(NEMO_DIGIT, 4, 4) + cc16_grouped = four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + cc_domain = (cc16_grouped @ digit_inline_rewrite).optimize() + + domain = ( + pynutil.delete("domain:") + + delete_space + + pynutil.delete('"') + + ((raw_domain @ digit_inline_rewrite) @ tld_rewrite @ add_space_before_dot) + + delete_space + + pynutil.delete('"') + ).optimize() + + # 6) protocol (like “https://” or “file:///”) + protocol = ( + pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space + ) + + protocol_raw = 
pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0) + cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space + + # Credit card case: "신용카드 ..." protocol + 16-digit domain grouped as 4-4-4-4 + cc_graph = ( + cc_protocol + + delete_space + + pynutil.delete("domain:") + + delete_space + + pynutil.delete('"') + + cc_domain + + pynutil.delete('"') + + delete_space + ).optimize() + + # 7) Combine: optional protocol + optional username + domain + default_graph = ( + pynini.closure(protocol + delete_space, 0, 1) + + pynini.closure(user_name + delete_space + pynutil.insert(" 골뱅이 ") + delete_space, 0, 1) + + domain + + delete_space + ).optimize() + + graph = (cc_graph | default_graph) @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA) + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py new file mode 100644 index 000000000..bafbf133d --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space + + +class FractionFst(GraphFst): + """ + Finite state transducer for verbalizing Korean fractions, e.g. + tokens { fraction { numerator: "3" denominator: "5" } } → 5분의3 + tokens { fraction { integer_part: "2" numerator: "7" denominator: "9" } } → 2과 9분의7 + tokens { fraction { denominator: "√8" numerator: "4" } } → 루트8분의4 + tokens { fraction { denominator: "2.75" numerator: "125" } } → 2.75분의125 + tokens { fraction { negative: "마이너스" numerator: "10" denominator: "11" } } → 마이너스11분의10 + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) + + # Handles square root symbols like "√3" → "루트3" + denominator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) + numerator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) + + # Matches non-root numeric content + denominator = pynini.closure(NEMO_NOT_QUOTE - "√") + numerator = pynini.closure(NEMO_NOT_QUOTE - "√") + + # Delete FST field: denominator and extract value + denominator_component = ( + pynutil.delete('denominator: "') + (denominator_root | denominator) + pynutil.delete('"') + ) + numerator_component = pynutil.delete('numerator: "') + (numerator_root | numerator) + pynutil.delete('"') + + # Match fraction form: "denominator + 분의 + numerator" + # Also deletes optional morphosyntactic_features: "분의" if present + graph_fraction = ( + denominator_component + + pynutil.delete(NEMO_SPACE) + + pynini.closure( + pynutil.delete('morphosyntactic_features:') + delete_space + pynutil.delete('"분의"') + delete_space, + 0, + 1, + ) + + pynutil.insert("분의") + + pynutil.insert(NEMO_SPACE) + + numerator_component + ) + + # Match and delete integer_part field (e.g., "2" in "2과3분의1") + graph_integer = 
( + pynutil.delete('integer_part:') + + delete_space + + pynutil.delete('"') + + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"')) + + pynutil.delete('"') + + pynutil.insert(NEMO_SPACE) + ) + graph_integer_fraction = graph_integer + delete_space + graph_fraction + + # Match and delete optional negative field (e.g., "마이너스") + optional_sign = ( + pynutil.delete('negative:') + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE - '"') + + pynutil.delete('"') + + delete_space + + pynutil.insert(NEMO_SPACE) + ) + + # Final graph handles optional negative + (integer + fraction | fraction only) + graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction) + + # Final optimized verbalizer FST + final_graph = self.delete_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/measure.py b/nemo_text_processing/text_normalization/ko/verbalizers/measure.py new file mode 100644 index 000000000..765b143cd --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/measure.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SIGMA, + GraphFst, + delete_space, + insert_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing Korean measure tokens into surface text. + measure { cardinal { integer: "<...>" } units: "<...>" } + + Converts tokens like: + measure { cardinal { integer: "이" } units: "킬로그램" } + measure { fraction { numerator: "이" denominator: "삼" } units: "킬로미터" } + + into surface text: + "이 킬로그램", "삼분의 이 킬로미터" + + Args: + decimal, cardinal, fraction: Sub-verbalizers handling number types. + deterministic: If True, outputs a single normalized form. + """ + + def __init__( + self, + decimal: GraphFst = None, + cardinal: GraphFst = None, + fraction: GraphFst = None, + deterministic: bool = True, + ): + super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + + # Combine all numeric verbalizers + graph_cardinal = cardinal.fst + graph_decimal = decimal.fst + graph_fraction = fraction.fst + + # Add a space after "마이너스" if it appears within numeric blocks + minus_space_rewrite = pynini.cdrewrite(pynini.cross("마이너스", "마이너스 "), "", "", NEMO_SIGMA).optimize() + + # Apply rewrite to each numeric subgraph to ensure spacing after "마이너스" + cardinal_spaced = graph_cardinal @ minus_space_rewrite + fraction_spaced = graph_fraction @ minus_space_rewrite + decimal_spaced = graph_decimal @ minus_space_rewrite + + # Combine all supported numeric types (cardinal | decimal | fraction) + number_block = decimal_spaced | cardinal_spaced | fraction_spaced + + # Extract and output unit string + units = ( + delete_space + + pynutil.delete("units:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + # Normal form: + main = number_block + insert_space + units + + # preserve_order form: + preserve_order = delete_space + 
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space

# ===== whitespace & token helpers =====
sp = pynini.closure(delete_space)  # absorb 0+ spaces
FIELD_VAL = pynini.closure(NEMO_NOT_QUOTE, 1)  # non-empty quoted field value


def del_key_val(key: str):
    """
    Delete the token field prefix and quotes, keep only the value.

    Input format: [sp] key: "<value>"
    Output: <value>

    Example:
        input  'integer_part: "삼백오십"'
        output '삼백오십'
    """
    return (sp + pynutil.delete(f'{key}: "') + FIELD_VAL + pynutil.delete('"')).optimize()


def drop_key_val(key: str):
    """
    Delete the entire key-value pair (key and its quoted value).

    Input format: [sp] key: "<value>"
    Output: (nothing)

    Example:
        input  'minor_part: "십"'
        output ''
    """
    # BUG FIX: the value itself must be deleted as well. Previously FIELD_VAL
    # was used as a plain acceptor here, so the supposedly "dropped" value
    # leaked through into the verbalized output.
    return (sp + pynutil.delete(f'{key}: "') + pynutil.delete(FIELD_VAL) + pynutil.delete('"')).optimize()


def drop_key_exact(key: str, val: str):
    """
    Delete the exact key-value pair if it matches the given value.

    Input format: [sp] key: "val"
    Output: (nothing)

    Example:
        input  'currency_maj: "원"'
        output ''
    """
    return (sp + pynutil.delete(f'{key}: "{val}"')).optimize()


class MoneyFst(GraphFst):
    """
    Verbalize Korean money.

    Input tokens:
        tokens { money { integer_part: "..." currency_maj: "..." [minor_part: "..."] } }

    Period (e.g., /월, /년, …) is intentionally NOT handled here.
    Output examples:
        integer_part: "십" currency_maj: "원"     -> "십원"
        integer_part: "삼십억" currency_maj: "원" -> "삼십억원"
        integer_part: "이백" currency_maj: "달러" -> "이백 달러"

    Args:
        deterministic: if True provides a single transduction; if False allows multiple.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="money", kind="verbalize", deterministic=deterministic)

        # --- fields ---
        integer_part = del_key_val("integer_part")
        minor_part_drop = drop_key_val("minor_part")  # ignore minor for KRW
        currency_val_any = del_key_val("currency_maj")  # ex) "원", "달러", "유로"
        won_key_drop = drop_key_exact("currency_maj", "원")  # don't print the key for KRW

        # ===== KRW (원) =====
        # (A) [integer] [원] -> "{integer}원"
        won_a = integer_part + sp + won_key_drop + pynutil.insert("원")
        # (B) [원] [integer] -> "{integer}원"  (field order emitted by tagger may vary)
        won_b = won_key_drop + sp + integer_part + pynutil.insert("원")
        won_core = won_a | won_b
        won_core = (won_core + pynini.closure(minor_part_drop, 0, 1)).optimize()

        # ===== Other currencies =====
        # "{integer} {currency}" (KRW sticks; others are spaced)
        other_core = (integer_part + insert_space + currency_val_any).optimize()
        other_core = (other_core + pynini.closure(minor_part_drop, 0, 1)).optimize()

        # ===== combine (no period) =====
        # KRW branch is preferred (weight 0.0) over the generic branch (0.5).
        graph_core = (pynutil.add_weight(won_core, 0.0) | pynutil.add_weight(other_core, 0.5)).optimize()

        # no trailing period mapping
        graph = graph_core

        # strip tokens wrapper
        self.fst = self.delete_tokens(graph).optimize()
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean ordinals, e.g.
        tokens { ordinal { integer: "첫번째" } } -> 첫번째

    Note: the verbalizer only strips the field wrapper and passes the quoted
    value through verbatim (the native-Korean ordinal spelling is produced by
    the tagger, not here).

    Args:
        deterministic: if True provides a single transduction; if False allows multiple.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # integer: "<value>" -> <value>
        graph_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE)
            + pynutil.delete("\"")
        )

        final_graph = graph_integer

        # strip the "ordinal { ... }" wrapper
        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# -*- coding: utf-8 -*-
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean telephone numbers.

    Input:
        telephone { [country_code: "...",] number_part: "..." [extension: "..."] }
    Output:
        ["국가번호 " + country_code + " "] + number_part [+ ", 내선 " + extension]

    Examples (corrected to match the transduction below — the FST prepends
    "국가번호 " itself and copies number_part verbatim, without inserting commas):
        telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" }
            -> 국가번호 팔이 영일영 삼칠일삼 칠영오영
        telephone { number_part: "팔영영 오오오 영영영영" extension: "이삼사" }
            -> 팔영영 오오오 영영영영, 내선 이삼사

    Args:
        deterministic: if True provides a single transduction; if False allows multiple.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

        # country_code (optional; "국가번호 " is inserted here, trailing space added)
        country = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('country_code: "')
            + pynutil.insert("국가번호 ")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
            + insert_space
        )

        # number_part (mandatory; value kept verbatim)
        number = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('number_part: "')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # extension (optional, prepended with ", 내선 ")
        ext_field = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('extension: "')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        extension_opt = pynini.closure(pynutil.insert(", 내선 ") + ext_field, 0, 1)

        # remove wrapper "telephone { ... }"
        graph = (
            pynutil.delete("telephone")
            + pynini.closure(delete_space, 0, 1)
            + pynutil.delete("{")
            + pynini.closure(country, 0, 1)
            + number
            + extension_opt
            + pynini.closure(delete_space, 0, 1)
            + pynutil.delete("}")
        )

        self.fst = graph.optimize()
# Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time tokens.

    Strips the quoted field wrappers and joins the values with single spaces,
    e.g. hours: "열시" minutes: "삼십분" -> "열시 삼십분".
    The exact zero values "영분" / "영초" are removed from the output entirely.
    An optional `suffix` field (e.g. a meridiem word), when present before the
    time fields, is kept and followed by a space.

    Args:
        deterministic: if True provides a single transduction; if False allows multiple.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        def _kept_field(key):
            # key: "<value>" -> <value>
            return pynutil.delete(key + ": \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")

        def _field_without(key, silent):
            # Like _kept_field, except the exact value `silent` maps to nothing.
            head = pynutil.delete(key + ": \"")
            tail = pynutil.delete("\"")
            dropped = head + pynini.cross(silent, "") + tail
            kept = head + (pynini.closure(NEMO_NOT_QUOTE) - silent) + tail
            return dropped | kept

        hours = _kept_field("hours")
        minutes = _field_without("minutes", "영분")
        seconds = _field_without("seconds", "영초")
        meridiem = _kept_field("suffix")

        # Collapse the inter-field whitespace of the token string to one space.
        joiner = delete_space + insert_space

        hms = hours + joiner + minutes + joiner + seconds
        hm = hours + joiner + minutes

        graph_basic_time = pynini.closure(meridiem + joiner, 0, 1) + (
            hms | hm | hours | minutes | seconds
        )

        final_graph = graph_basic_time

        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini

from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.ko.verbalizers.date import DateFst
from nemo_text_processing.text_normalization.ko.verbalizers.decimal import DecimalFst
from nemo_text_processing.text_normalization.ko.verbalizers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.ko.verbalizers.fraction import FractionFst
from nemo_text_processing.text_normalization.ko.verbalizers.measure import MeasureFst
from nemo_text_processing.text_normalization.ko.verbalizers.money import MoneyFst
from nemo_text_processing.text_normalization.ko.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.ko.verbalizers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.ko.verbalizers.time import TimeFst
from nemo_text_processing.text_normalization.ko.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.ko.verbalizers.word import WordFst


class VerbalizeFst(GraphFst):
    """
    Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)

        # Some verbalizers are shared with MeasureFst, so build them once.
        cardinal = CardinalFst(deterministic=deterministic)
        decimal = DecimalFst(deterministic=deterministic)
        fraction = FractionFst(deterministic=deterministic)

        verbalizers = [
            cardinal,
            OrdinalFst(deterministic=deterministic),
            WordFst(deterministic=deterministic),
            decimal,
            fraction,
            DateFst(deterministic=deterministic),
            WhiteListFst(deterministic=deterministic),
            TimeFst(deterministic=deterministic),
            MoneyFst(deterministic=deterministic),
            TelephoneFst(deterministic=deterministic),
            MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic),
            ElectronicFst(deterministic=deterministic),
        ]

        # Any single semiotic-class verbalizer may match a token.
        self.fst = pynini.union(*(v.fst for v in verbalizers)).optimize()
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import (
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_space,
    generator_main,
)
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.utils.logging import logger


class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now .

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)

        # Resolve the FAR cache location, if caching is enabled.
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far")

        can_restore = far_file is not None and os.path.exists(far_file) and not overwrite_cache
        if can_restore:
            # Fast path: load the previously compiled grammar.
            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
        else:
            # Verbalize one "tokens { ... }" group via the composed grammar.
            single_token = (
                pynutil.delete("tokens {")
                + delete_space
                + VerbalizeFst(deterministic=deterministic).fst
                + delete_space
                + pynutil.delete(" }")
            )

            token_separator = pynini.closure(NEMO_WHITE_SPACE, 1)

            # A sentence is one or more whitespace-separated token groups.
            sentence = (
                delete_space
                + single_token
                + pynini.closure(token_separator + single_token)
                + delete_space
            )

            self.fst = sentence.optimize()

            if far_file:
                generator_main(far_file, {"verbalize": self.fst})
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space


class WhiteListFst(GraphFst):
    """
    Verbalizer for whitelisted tokens: unwraps the quoted name field.

    tokens { name: "부터" } -> 부터
    tokens { name: "~" }   -> ~

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)

        # name: "<value>" -> <value>; the value itself contains no plain spaces
        # (non-breaking spaces are allowed and rewritten below).
        unquoted = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_CHAR - " ", 1)
            + pynutil.delete("\"")
        )

        # Turn any non-breaking space (U+00A0) back into a regular space.
        nbsp_to_space = pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)

        self.fst = (unquoted @ nbsp_to_space).optimize()
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

# Consistency fix: import from the ko graph_utils like every sibling ko
# verbalizer module, not from ja.
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst


class WordFst(GraphFst):
    """
    Korean verbalizer for word tokens: unwraps the quoted name field.
    tokens { name: "이름" } -> 이름

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="word", kind="verbalize", deterministic=deterministic)

        # BUG FIX: a bare NEMO_NOT_QUOTE accepts exactly ONE character, so
        # multi-character names (e.g. "이름") could never be verbalized.
        # Wrap it in a closure, matching the sibling verbalizers.
        graph = pynutil.delete("name: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        self.fst = graph.optimize()
index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..50c58425a --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,37 @@ +영~0 +구~9 +십~10 +십칠~17 +오십삼~53 +백~100 +백오~105 +삼백이십~320 +구백팔십칠~987 +천~1000 +천육~1006 +천오백~1500 +오천사백삼십이~5432 +만~10000 +만천이백~11200 +삼만오천칠백~35700 +십이만~120000 +백오십만삼천~1503000 +천만~10000000 +오천이백칠십만육천백~52706100 +억~100000000 +삼억오천만~350000000 +십이억천만~1210000000 +백오십억칠천만~15070000000 +오천억~500000000000 +일조~1000000000000 +이조오천억~2500000000000 +영영영~0 0 0 +영영백이십삼~0 0 123 +만천~11000 +만천백십일~11111 +경~10000000000000000 +마이너스일~-1 +마이너스 일~-1 +- 일~-1 +마이너스일억사천이백칠십구만구천팔십이~-142799082 +마이너스 칠백삼십오~-735 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..89fa75eb6 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,15 @@ +이천이십사년팔월이십팔일~2024년8월28일 +이천이십삼년 구월 오일~2023년9월5일 
+천구백구십구년십이월삼십일일~1999년12월31일 +이천년 이월 이십구일~2000년2월29일 +이천십년시월십일~2010년10월10일 +이천이십일년유월십육일~2021년6월16일 +이천삼십년삼월십사일~2030년3월14일 +천구백팔십팔년 오월 이십일~1988년5월20일 +이천일년 칠월 구일~2001년7월9일 +이천십팔년사월삼십일~2018년4월30일 +삼천년팔월십오일~3000년8월15일 +이천구년 일월이십일~2009년1월20일 +이천삼십오년~2035년 +오월~5월 +구천구백구십구년삼월일일~9999년3월1일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..63d023168 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,14 @@ +일점삼~1.3 +영점오~0.5 +십점오~10.5 +이십삼점사~23.4 +백점일~100.1 +일점이삼~1.23 +영점오육칠~0.567 +구십구점구구~99.99 +일점영삼~1.03 +영점영영일~0.001 +천이백삼십사점오육~1234.56 +일점오만~1.5만 +일점오억~1.5억 +일점오경~1.5경 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..c5fda707d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,29 @@ +이분의일~1/2 +사분의일~1/4 +사분의삼~3/4 +오분의이~2/5 +십분의칠~7/10 +십이분의오~5/12 +이십삼분의십~10/23 +백분의일~1/100 +백분의구십구~99/100 +천분의백이십삼~123/1000 +일과이분의일~1 1/2 +삼과사분의일~3 1/4 +오와팔분의삼~5 3/8 +십과백분의칠십오~10 75/100 +마이너스사분의일~-1/4 +영점오분의일~1/0.5 +삼분의일점오~1.5/3 +루트사분의일~1/√4 +구분의루트십육~√16/9 +이와루트구분의일~2 1/√9 +마이너스오분의루트이십오~-√25/5 +칠분의육~6/7 +백오십분의이십~20/150 +사와오분의이~4 2/5 +이십과백분의일~20 1/100 +일점오분의영점이~0.2/1.5 +루트백분의십~10/√100 +십과루트팔십일분의삼~10 3/√81 +마이너스이와십분의일~-2 1/10 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..fa020a49c --- /dev/null +++ 
b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,46 @@ +오 킬로미터~5 km +백 미터~100 m +이십 센티미터~20 cm +삼십 밀리미터~30 mm +십이 점 일구오 킬로미터~12.195 km +천오백 미터~1500 m +영 점 오 밀리미터~0.5 mm +칠 미터~7 m +오십 킬로그램~50 kg +이백 그램~200 g +삼 톤~3 t +일 점 오 킬로그램~1.5 kg +천 밀리그램~1000 mg +십삼 톤~13 t +사백오십 그램~450 g +이 리터~2 L +오백 밀리리터~500 ml +일 점 오 리터~1.5 L +십 씨씨~10 cc +삼천 씨씨~3000 cc +칠십 밀리리터~70 ml +영하 십 도~-10 ° +이십오 도~25 ° +백 도~100 ° +삼십육점오 도~36.5 ° +삼십삼 평~33 py +백 제곱미터~100 m² +오십 평~50 py +십 제곱킬로미터~10 km² +오백 기가바이트~500 GB +십육 기가~16 GB +일 테라바이트~1 TB +이백오십육 메가바이트~256 MB +육십 헤르츠~60 Hz +백 메가헤르츠~100 MHz +백 킬로미터퍼시간~100 km/h +백퍼센트~100 % +오십 프로~50 % +삼십 점 오 퍼센트~30.5 % +십 미터퍼초~10 m/s +천알피엠~1000 rpm +이백이십 볼트~220 V +육십 와트~60 W +십 암페어~10 A +오천 밀리암페어~5000 mA +일점오 볼트~1.5 V \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..09c6b2841 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,29 @@ +오천 원~₩5000 +만오천원~₩15000 +십이만삼천 원~₩123000 +백 원~₩100 +천백십일 원~₩1111 +육십만 원~₩600000 +백만 원~₩1000000 +삼백오십만 원~₩3500000 +천이백만 원~₩12000000 +일억 원~₩100000000 +십이억오천만 원~₩1250000000 +백억 원~₩10000000000 +오천억원~₩500000000000 +일조 원~₩1000000000000 +삼조오천억 원~₩3500000000000 +영원~₩0 +구십구 원~₩99 +만 원~₩10000 +일만 원~₩10000 +십오 달러~$15 +이십불~$20 +천오백 불~$1500 +백만 달러~$1000000 +오십 유로~€50 +천 엔~¥1000 +만 엔~¥10000 +백 파운드~£100 +이십 위안~¥20 +구천구백구십구원~₩9999 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..96fbb7005 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,17 @@ +영번째~0번째 +첫번째~1번째 +두번째~2번째 +세번째~3번째 
+다섯번째~5번째 +아홉번째~9번째 +열번째~10번째 +열한번째~11번째 +열일곱번째~17번째 +스무번째~20번째 +스물두번째~22번째 +스물아홉번째~29번째 +서른번째~30번째 +사십번째~40번째 +오십번째~50번째 +오십삼번째~53번째 +백번째~100번째 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..605baf0c7 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,7 @@ +공일공에일이삼사에오육칠팔~010-1234-5678 +영일영 구팔칠육 오사삼이~010-9876-5432 +공이에삼사오육에칠팔구공~02-3456-7890 +공삼일에구팔칠에육오사삼~031-987-6543 +공일공 일이삼사 오육칠팔~010-1234-5678 +공이 삼사오에육칠팔구~02-345-6789 +공일공일이삼사오육칠팔~01012345678 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..2700b6ccc --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,15 @@ +두시~2:00 +열두시~12:00 +두시 삼십분~2:30 +세시 삼분~3:03 +두시 반~2:30 +열두시 반~12:30 +두시 오초~2:00:05 +두시 삼십분 오초~2:30:05 +오전 두시~오전 2:00 +오후 네시 반~오후 4:30 +두시전~2:00 전 +두시십분후~2:10 후 +한시 십오분 삼십초~1:15:30 +네시 이분~4:02 +열한시 오십구분~11:59 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..40187f74e --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,48 @@ +1~일 +2~이 +3~삼 +123~백이십삼 +13000~만삼천 +9000~구천 +123000~십이만삼천 +123000012~일억이천삼백만십이 +1000000~백만 +100000000~일억 +1000000000000~일조 +100000000000000~백조 +20000000000001~이십조일 +800000000001001~팔백조천일 +82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일 +9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구 +99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구 
+999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 +9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 +19~십구 +76~칠십육 +379~삼백칠십구 +850~팔백오십 +1004~천사 +8326~팔천삼백이십육 +10383~만삼백팔십삼 +34892~삼만사천팔백구십이 +573234~오십칠만삼천이백삼십사 +982010~구십팔만이천십 +2349023~이백삼십사만구천이십삼 +4303189~사백삼십만삼천백팔십구 +60321589~육천삼십이만천오백팔십구 +88234568~팔천팔백이십삼만사천오백육십팔 +792133923~칠억구천이백십삼만삼천구백이십삼 +187624689~일억팔천칠백육십이만사천육백팔십구 +2304050708~이십삼억사백오만칠백팔 +6436789729~육십사억삼천육백칠십팔만구천칠백이십구 +78234580257~칠백팔십이억삼천사백오십팔만이백오십칠 +987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오 +2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사 +35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구 +470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육 +5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오 +67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십 +-2~마이너스 이 +-93~마이너스 구십삼 +-90325~마이너스 구만삼백이십오 +-3234567~마이너스 삼백이십삼만사천오백육십칠 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..c3e81a25c --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +2024년~이천이십사년 +3월~삼월 +15일~십오일 +2024/01/30~이천이십사년 일월 삼십일 +10/30/2024~이천이십사년 시월 삼십일 +29/12/2000~이천년 십이월 이십구일 +2024/3/20~이천이십사년 삼월 이십일 +2024-07-23~이천이십사년 칠월 이십삼일 +1999.9.17~천구백구십구년 구월 십칠일 +기원전128년~기원전 백이십팔년 +기원후1390년~기원후 천삼백구십년 +28일수요일~이십팔일 수요일 +1900년대~천구백년대 +1월1일(월)〜3일(수)~일월 일일 월요일 부터 삼일 수요일 +5월10일(금)〜15일(수)~오월 십일 금요일 부터 십오일 수요일 +8월20일〜25일~팔월 이십일 부터 이십오일 +12월30일(토)〜1월2일(화)~십이월 삼십일 토요일 부터 일월 이일 화요일 +2월28일(목)〜3월3일(일)~이월 이십팔일 목요일 부터 삼월 삼일 일요일 +6월1일〜5일~유월 일일 부터 오일 +10월8일(화)〜10일(목)~시월 팔일 화요일 부터 십일 목요일 +1970〜1980년대~천구백칠십 부터 천구백팔십년대 +80〜90년대~팔십 부터 구십년대 +2010〜2020년대~이천십 부터 이천이십년대 +7월5〜9일(월〜금)~칠월 오 부터 구일 월요일부터금요일 +3월10〜15일(화〜일)~삼월 십 부터 십오일 화요일부터일요일 +11월1〜5일(수〜일)~십일월 일 부터 오일 수요일부터일요일 +2023년3월1일(수)〜6월12일(화)~이천이십삼년 삼월 일일 수요일 부터 유월 십이일 화요일 +2024년1월15일(월)〜2월10일(토)~이천이십사년 일월 십오일 월요일 부터 이월 십일 토요일 +2025년12월20일(토)〜2026년1월5일(월)~이천이십오년 십이월 이십일 토요일 부터 이천이십육년 일월 오일 
월요일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..d363c5bb2 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt @@ -0,0 +1,28 @@ +-0.1~마이너스 영점일 +-0.5~마이너스 영점오 +-1.1~마이너스 일점일 +-2.5~마이너스 이점오 +-4.2~마이너스 사점이 +-11.99~마이너스 십일점구구 +-15.8~마이너스 십오점팔 +-25.3~마이너스 이십오점삼 +-30.8~마이너스 삼십점팔 +-72.4~마이너스 칠십이점사 +-100.5~마이너스 백점오 +0.1~영점일 +0.5~영점오 +1.1~일점일 +2.5~이점오 +4.2~사점이 +11.99~십일점구구 +15.8~십오점팔 +25.3~이십오점삼 +30.8~삼십점팔 +42.75~사십이점칠오 +72.4~칠십이점사 +100.5~백점오 +123.99~백이십삼점구구 +165.4~백육십오점사 +999.99~구백구십구점구구 +1000.01~천점영일 +123456.2234~십이만삼천사백오십육점이이삼사 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..4e09d0db2 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt @@ -0,0 +1,20 @@ +a@hotmail.de~a 골뱅이 hotmail 닷 디이 +a@hotmail.fr~a 골뱅이 hotmail 닷 에프알 +a@hotmail.it~a 골뱅이 hotmail 닷 아이티 +a@aol.it~a 골뱅이 aol 닷 아이티 +a@msn.it~a 골뱅이 msn 닷 아이티 +abc@nvidia.app~abc 골뱅이 nvidia 닷 앱 +user01@gmail.co.kr~user영일 골뱅이 gmail 닷 씨오 닷 케이알 +nvidia.co.kr~nvidia 닷 씨오 닷 케이알 +1234-5678-9012-3456~신용카드 일이삼사 오육칠팔 구영일이 삼사오육 +2345-2222-3333-4444~신용카드 이삼사오 이이이이 삼삼삼삼 사사사사 +9090-1234-5555-9876~신용카드 구영구영 일이삼사 오오오오 구팔칠육 +카드 마지막 네자리 3456~카드 마지막 네자리 삼사오육 +카드 마지막 4자리 7890~카드 마지막 네자리 칠팔구영 +카드 끝자리 3456~카드 끝자리 삼사오육 +사진.jpg~사진 닷 제이피지 +사진.JPG~사진 닷 제이피지 +사진.png~사진 닷 피엔지 +사진.PNG~사진 닷 피엔지 +문서.pdf~문서 닷 피디에프 +문서.PDF~문서 닷 피디에프 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..a183be59b --- /dev/null +++ 
b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,14 @@ +1/2~이분의 일 +-1/2~마이너스 이분의 일 +1과1/2~일과 이분의 일 +2와12/33~이와 삼십삼분의 십이 +-1과1/2~마이너스 일과 이분의 일 +마이너스1과1/2~마이너스 일과 이분의 일 +마이너스1과√1/2~마이너스 일과 이분의 루트 일 +-1과√1/2~마이너스 일과 이분의 루트 일 +1과√1/2~일과 이분의 루트 일 +1과1/√3~일과 루트 삼분의 일 +1과1/3~일과 삼분의 일 +1과√1/4~일과 사분의 루트 일 +3분의1~삼분의 일 +121분의3221~백이십일분의 삼천이백이십일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..3d24c4f0d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,40 @@ +1kg~일 킬로그램 +12kg~십이 킬로그램 +0kg~영 킬로그램 +100g~백 그램 +500g~오백 그램 +1m~일 미터 +12km~십이 킬로미터 +5cm~오 센티미터 +15mm~십오 밀리미터 +1000km~천 킬로미터 +9999m~구천구백구십구 미터 +10L~십 리터 +1l~일 리터 +250ml~이백오십 밀리리터 +1mL~일 밀리리터 +123h~백이십삼 시간 +3s~삼 초 +60km/h~육십 킬로미터 퍼 시간 +1m/s~일 미터 퍼 초 +12kg/kg~십이 킬로그램 퍼 킬로그램 +1km/m~일 킬로미터 퍼 미터 +50W~오십 와트 +440Hz~사백사십 헤르츠 +300N~삼백 뉴턴 +120rpm~백이십 분당회전수 +100%~백 퍼센트 +30°~삼십 도 +0.5kg~영점오 킬로그램 +2.3km~이점삼 킬로미터 +12.5L~십이점오 리터 +3.14m~삼점일사 미터 +0.03m~영점영삼 미터 +1/2kg~이분의 일 킬로그램 +2/3km~삼분의 이 킬로미터 +5/8cm~팔분의 오 센티미터 +2과3/4L~이과 사분의 삼 리터 +10과1/2km/h~십과 이분의 일 킬로미터 퍼 시간 +-3/4km~마이너스 사분의 삼 킬로미터 +-3.1km~마이너스 삼점일 킬로미터 +-3km~마이너스 삼 킬로미터 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..7a22075ab --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt @@ -0,0 +1,64 @@ +₩2000~이천원 +₩2,000~이천원 +2000원~이천원 +KRW 1230000~백이십삼만원 +₩460000000000~사천육백억원 +₩30억~삼십억원 +₩30조~삼십조원 +₩45억~사십오억원 +₩15000~만오천원 +₩1~일원 +₩20~이십원 +₩18925000~천팔백구십이만오천원 +₩18,925,000~천팔백구십이만오천원 +₩18854~만팔천팔백오십사원 +₩18129~만팔천백이십구원 +₩0~영원 +₩7~칠원 +₩10~십원 +₩11~십일원 +₩21~이십일원 +₩99~구십구원 +200원~이백원 
+999원~구백구십구원 +₩1,000~천원 +₩9,999~구천구백구십구원 +₩10,000~만원 +₩20,000~이만원 +₩100,000~십만원 +₩1,000,000~백만원 +₩2,500,000~이백오십만원 +₩12,345~만이천삼백사십오원 +₩1,234,567~백이십삼만사천오백육십칠원 +₩10,000,000~천만원 +₩23,456,000~이천삼백사십오만육천원 +₩100,000,000~일억원 +₩123,000,000~일억이천삼백만원 +₩2억~이억원 +₩12억~십이억원 +₩100억~백억원 +₩1,000억~천억원 +₩3,400억~삼천사백억원 +₩4조~사조원 +₩12조~십이조원 +KRW 1,000,000,000~십억원 +krw 2,345,600,000~이십삼억사천오백육십만원 +₩2,300,000,000~이십삼억원 +₩999,999,999~구억구천구백구십구만구천구백구십구원 +KRW 30,000,000~삼천만원 +krw 5000~오천원 +$ 0~영 달러 +$ 1~일 달러 +$ 200~이백 달러 +US$ 1,234,567~백이십삼만사천오백육십칠 달러 +$ 30억~삼십억 달러 +HK$ 300~삼백 홍콩 달러 +€500~오백 유로 +EUR 1,230,000~백이십삼만 유로 +¥2000~이천 엔 +JPY 1,230,000~백이십삼만 엔 +¥30조~삼십조 엔 +CAD 2,500~이천오백 캐나다 달러 +NZD 123,456~십이만삼천사백오십육 뉴질랜드 달러 +CHF 100~백 스위스 프랑 +AED 75~칠십오 아랍에미리트 디르함 diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..3544a2aeb --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,54 @@ +1번째~첫번째 +2번째~두번째 +3번째~세번째 +4번째~네번째 +5번째~다섯번째 +6번째~여섯번째 +7번째~일곱번째 +8번째~여덟번째 +9번째~아홉번째 +10번째~열번째 +11번째~열한번째 +12번째~열두번째 +13번째~열세번째 +14번째~열네번째 +15번째~열다섯번째 +16번째~열여섯번째 +17번째~열일곱번째 +18번째~열여덟번째 +19번째~열아홉번째 +20번째~스무번째 +21번째~스물한번째 +22번째~스물두번째 +23번째~스물세번째 +24번째~스물네번째 +25번째~스물다섯번째 +26번째~스물여섯번째 +27번째~스물일곱번째 +28번째~스물여덟번째 +29번째~스물아홉번째 +30번째~서른번째 +31번째~서른한번째 +32번째~서른두번째 +33번째~서른세번째 +34번째~서른네번째 +35번째~서른다섯번째 +36번째~서른여섯번째 +37번째~서른일곱번째 +38번째~서른여덟번째 +39번째~서른아홉번째 +100번째~백번째 +189번째~백팔십구번째 +1034번째~천삼십사번째 +7324번째~칠천삼백이십사번째 +23456번째~이만삼천사백오십육번째 +78903번째~칠만팔천구백삼번째 +345678번째~삼십사만오천육백칠십팔번째 +987654번째~구십팔만칠천육백오십사번째 +1000000번째~백만번째 +5678901번째~오백육십칠만팔천구백일번째 +89123456번째~팔천구백십이만삼천사백오십육번째 +62345098번째~육천이백삼십사만오천구십팔번째 +235067092번째~이억삼천오백육만칠천구십이번째 +876543210번째~팔억칠천육백오십사만삼천이백십번째 +1000000000번째~십억번째 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt 
b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..b6e573aec --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,33 @@ ++1 123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +123-123-5678~일이삼 일이삼 오육칠팔 ++1-123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 ++1 (123)-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +(123)-123-5678~일이삼 일이삼 오육칠팔 +555.555.5555~오오오 오오오 오오오오 +(123) 123-5678~일이삼 일이삼 오육칠팔 +010-3713-7050~영일영 삼칠일삼 칠영오영 ++82 123-456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 ++82-123-4567-8901~국가번호 팔이 일이삼 사오육칠 팔구영일 ++44-207-555-1234~국가번호 사사 이영칠 오오오 일이삼사 +123.456-7890~일이삼 사오육 칠팔구영 +123-456.7890~일이삼 사오육 칠팔구영 +(987)-654-3210~구팔칠 육오사 삼이일영 +(987) 654-3210~구팔칠 육오사 삼이일영 ++7 000-000-0000~국가번호 칠 영영영 영영영 영영영영 +000.000.0000~영영영 영영영 영영영영 +271-828-1828~이칠일 팔이팔 일팔이팔 +314-159-2653~삼일사 일오구 이육오삼 +(010) 123-4567~영일영 일이삼 사오육칠 ++358-123-456-7890~국가번호 삼오팔 일이삼 사오육 칠팔구영 ++1 800-555-0000~국가번호 일 팔영영 오오오 영영영영 +(800) 555-0000~팔영영 오오오 영영영영 ++12 345-678-9012~국가번호 일이 삼사오 육칠팔 구영일이 ++999 999-999-9999~국가번호 구구구 구구구 구구구 구구구구 +321.654.0987~삼이일 육오사 영구팔칠 ++82 010-1234-5678~국가번호 팔이 영일영 일이삼사 오육칠팔 +(999)-000-0000~구구구 영영영 영영영영 ++1-123.456.7890~국가번호 일 일이삼 사오육 칠팔구영 ++82-123.456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 +111-222-3333~일일일 이이이 삼삼삼삼 +909-808-7070~구영구 팔영팔 칠영칠영 +(555)555-5555~오오오 오오오 오오오오 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..2fb79402d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt @@ -0,0 +1,25 @@ +오전10시30분~오전 열시 삼십분 +오전 10시30분~오전 열시 삼십분 +오전 10:30~오전 열시 삼십분 +오전 10:30:05~오전 열시 삼십분 오초 +오후 3시 반~오후 세시 반 +오후3시30분~오후 세시 삼십분 +오후 03:30~오후 세시 삼십분 +새벽 4시 5분~새벽 네시 오분 +새벽 04:05~새벽 네시 오분 +아침 7시~아침 일곱시 +낮 12시 15분~낮 열두시 십오분 +저녁 8시 45분~저녁 여덟시 사십오분 +밤 11시 55분 5초~밤 열한시 오십오분 오초 +밤 11:50:05~밤 열한시 오십분 오초 +정오~열두시 +자정~영시 +14:05~십사시 오분 
+18:05~십팔시 오분 +23:00~이십삼시 +00:30:00~영시 삼십분 +24:03:38~이십사시 삼분 삼십팔초 +오전 0시 15분~오전 영시 십오분 +오후 12시 10분~오후 열두시 십분 +아침7시1분~아침 일곱시 일분 +저녁9시09분~저녁 아홉시 구분 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..d7d1f74c7 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer_ko = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_date.py b/tests/nemo_text_processing/ko/test_date.py new file mode 100644 index 000000000..a0f8daa9f --- /dev/null +++ b/tests/nemo_text_processing/ko/test_date.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_date(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_decimal.py b/tests/nemo_text_processing/ko/test_decimal.py new file mode 100644 index 000000000..99cf64894 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_decimal.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_decimal(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_electronic.py b/tests/nemo_text_processing/ko/test_electronic.py new file mode 100644 index 000000000..d06099328 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_electronic.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestElectronic: + normalizer_ko = Normalizer( + lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_electronic.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_electronic(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_fraction.py b/tests/nemo_text_processing/ko/test_fraction.py new file mode 100644 index 000000000..7b8742da6 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_fraction.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFraction: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_fraction(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_measure.py b/tests/nemo_text_processing/ko/test_measure.py new file mode 100644 index 000000000..ea053b281 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_measure.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_measure(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_money.py b/tests/nemo_text_processing/ko/test_money.py new file mode 100644 index 000000000..63ff5ca1b --- /dev/null +++ b/tests/nemo_text_processing/ko/test_money.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMoney: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_money(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_ordinal.py b/tests/nemo_text_processing/ko/test_ordinal.py new file mode 100644 index 000000000..256a79249 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_ordinal.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + normalizer_ko = Normalizer( + lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ordinal(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..f11856232 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,74 @@ +#! 
/bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ko"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNOrdinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testITNDecimal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNFraction() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_fraction.txt + runtest $input +} + +testITNTime() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNDate() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNMoney() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + +testITNTelephone() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + +testITNMeasure() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh new file mode 100644 index 000000000..9adbc152b --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh @@ -0,0 +1,83 @@ +#! /bin/sh +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + # replace non breaking space with breaking space + # Use below if postprocessor is not used. Comment if it is used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # Use below if postprocessor is used. Comment if it is not used + # denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + +testTNCardinal() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNOrdinalText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testTNDecimalText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +testTNFractionText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_fraction.txt + runtest $input +} + +testTNDateText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_date.txt + runtest $input +} + +testTNTimeText() { + 
input=$TEST_DIR/ko/data_text_normalization/test_cases_time.txt + runtest $input +} + +testTNMoneyText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_money.txt + runtest $input +} + +testTNTelephoneText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_telephone.txt + runtest $input +} + +testTNMeasureText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_measure.txt + runtest $input +} + +testTNElectronicText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_electronic.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_telephone.py b/tests/nemo_text_processing/ko/test_telephone.py new file mode 100644 index 000000000..949fdc068 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_telephone.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_telephone(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_time.py b/tests/nemo_text_processing/ko/test_time.py new file mode 100644 index 000000000..9ba942772 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_time.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + normalizer_ko = Normalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_time(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 2e10e81f2..3e80b56ff 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -107,6 +107,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko', ], type=str, default='en', @@ -327,6 +328,17 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) + from 
nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst else: raise KeyError(f"Language {args.language} is not defined for export.") output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") diff --git a/tutorials/Text_(Inverse)_Normalization.ipynb b/tutorials/Text_(Inverse)_Normalization.ipynb index 9aea8317c..6a21e3856 100644 --- a/tutorials/Text_(Inverse)_Normalization.ipynb +++ b/tutorials/Text_(Inverse)_Normalization.ipynb @@ -92,6 +92,7 @@ "source": [ "# create text normalization instance that works on cased input\n", "from nemo_text_processing.text_normalization.normalize import Normalizer\n", + "\n", "normalizer = Normalizer(input_case='cased', lang='en')" ] }, @@ -101,7 +102,7 @@ "metadata": {}, "outputs": [], "source": [ - "# the normalizer class offers the following parameterization. 
\n", + "# the normalizer class offers the following parameterization.\n", "print(normalizer.__doc__)" ] }, @@ -246,7 +247,7 @@ "outputs": [], "source": [ "# example evaluation sentence\n", - "eval_text = \"\"\"PLAIN\\ton\\t\n", + "eval_text = \"\"\"PLAIN\\ton\\t\n", "DATE\\t22 july 2012\\tthe twenty second of july twenty twelve\n", "PLAIN\\tthey\\t\n", "PLAIN\\tworked\\t\n", @@ -278,6 +279,7 @@ "source": [ "# Parse evaluation file into written and normalized sentence pairs\n", "from nemo_text_processing.text_normalization.data_loader_utils import load_files, training_data_to_sentences\n", + "\n", "eval_data = load_files([EVAL_FILE])\n", "sentences_un_normalized, sentences_normalized, sentences_class_types = training_data_to_sentences(eval_data)\n", "print(list(zip(sentences_un_normalized, sentences_normalized)))" @@ -302,9 +304,8 @@ "source": [ "# measure sentence accuracy\n", "from nemo_text_processing.text_normalization.data_loader_utils import evaluate\n", - "sentences_accuracy = evaluate(\n", - " preds=sentences_prediction, labels=sentences_normalized, input=sentences_un_normalized\n", - " )\n", + "\n", + "sentences_accuracy = evaluate(preds=sentences_prediction, labels=sentences_normalized, input=sentences_un_normalized)\n", "print(\"- Accuracy: \" + str(sentences_accuracy))" ] }, @@ -324,6 +325,7 @@ "source": [ "# create inverse text normalization instance\n", "from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer\n", + "\n", "inverse_normalizer = InverseNormalizer(lang='en')" ] }, @@ -381,14 +383,14 @@ "metadata": {}, "outputs": [], "source": [ - "# initialize normalizer, this may take some time to generate the extended grammars. 
\n", + "# initialize normalizer, this may take some time to generate the extended grammars.\n", "# Thus, we recommend to cache the grammars by specifying a cache directory\n", "normalizer = NormalizerWithAudio(\n", - " lang=\"en\",\n", - " input_case=\"cased\",\n", - " overwrite_cache=False,\n", - " cache_dir=\"cache_dir\",\n", - " )\n", + " lang=\"en\",\n", + " input_case=\"cased\",\n", + " overwrite_cache=False,\n", + " cache_dir=\"cache_dir\",\n", + ")\n", "# create up to 10 normalization options\n", "print(normalizer.normalize(\"123\", n_tagged=10, punct_post_process=True))" ] diff --git a/tutorials/WFST_Tutorial.ipynb b/tutorials/WFST_Tutorial.ipynb index 5b1c0530e..2963e905d 100644 --- a/tutorials/WFST_Tutorial.ipynb +++ b/tutorials/WFST_Tutorial.ipynb @@ -60,7 +60,15 @@ "metadata": {}, "outputs": [], "source": [ - "from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_DIGIT, delete_space, NEMO_SIGMA, NEMO_NOT_QUOTE, delete_extra_space, NEMO_NON_BREAKING_SPACE\n", + "from nemo_text_processing.text_normalization.en.graph_utils import (\n", + " GraphFst,\n", + " NEMO_DIGIT,\n", + " delete_space,\n", + " NEMO_SIGMA,\n", + " NEMO_NOT_QUOTE,\n", + " delete_extra_space,\n", + " NEMO_NON_BREAKING_SPACE,\n", + ")\n", "from nemo_text_processing.text_normalization.normalize import Normalizer\n", "\n", "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", @@ -77,7 +85,7 @@ "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst\n", "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst\n", "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n" + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst" ] }, { @@ -492,16 +500,17 @@ "source": [ "from 
pynini.lib import pynutil\n", "\n", + "\n", "def apply_fst(text, fst):\n", - " \"\"\" Given a string input, returns the output string\n", - " produced by traversing the path with lowest weight.\n", - " If no valid path accepts input string, returns an\n", - " error.\n", - " \"\"\"\n", - " try:\n", - " print(pynini.shortestpath(text @ fst).string())\n", - " except pynini.FstOpError:\n", - " print(f\"Error: No valid output with given input: '{text}'\")" + " \"\"\"Given a string input, returns the output string\n", + " produced by traversing the path with lowest weight.\n", + " If no valid path accepts input string, returns an\n", + " error.\n", + " \"\"\"\n", + " try:\n", + " print(pynini.shortestpath(text @ fst).string())\n", + " except pynini.FstOpError:\n", + " print(f\"Error: No valid output with given input: '{text}'\")" ] }, { @@ -563,19 +572,21 @@ }, "outputs": [], "source": [ - "zero = pynini.string_map([(\"zéro\",\"0\")]) # French only pronounces zeroes as stand alone\n", - "digits = pynini.string_map([ # pynini function that creates explicit input-output mappings for a WFST\n", - "\t\t\t\t(\"un\",\"1\"),\n", - "\t\t\t\t(\"une\",\"1\"),\n", - "\t\t\t\t(\"deux\",\"2\"),\n", - "\t\t\t\t(\"trois\",\"3\"),\n", - "\t\t\t\t(\"quatre\",\"4\"),\n", - "\t\t\t\t(\"cinq\",\"5\"),\n", - "\t\t\t\t(\"six\",\"6\"),\n", - "\t\t\t\t(\"sept\",\"7\"),\n", - "\t\t\t\t(\"huit\",\"8\"),\n", - "\t\t\t\t(\"neuf\",\"9\")\n", - "])" + "zero = pynini.string_map([(\"zéro\", \"0\")]) # French only pronounces zeroes as stand alone\n", + "digits = pynini.string_map(\n", + " [ # pynini function that creates explicit input-output mappings for a WFST\n", + " (\"un\", \"1\"),\n", + " (\"une\", \"1\"),\n", + " (\"deux\", \"2\"),\n", + " (\"trois\", \"3\"),\n", + " (\"quatre\", \"4\"),\n", + " (\"cinq\", \"5\"),\n", + " (\"six\", \"6\"),\n", + " (\"sept\", \"7\"),\n", + " (\"huit\", \"8\"),\n", + " (\"neuf\", \"9\"),\n", + " ]\n", + ")" ] }, { @@ -678,14 +689,16 @@ }, "outputs": [], "source": [ 
- "teens = pynini.string_map([\n", - "\t\t\t\t(\"onze\",\"11\"),\n", - "\t\t\t\t(\"douze\",\"12\"),\n", - "\t\t\t\t(\"treize\",\"13\"),\n", - "\t\t\t\t(\"quatorze\",\"14\"),\n", - "\t\t\t\t(\"quinze\",\"15\"),\n", - "\t\t\t\t(\"seize\",\"16\"),\n", - "])" + "teens = pynini.string_map(\n", + " [\n", + " (\"onze\", \"11\"),\n", + " (\"douze\", \"12\"),\n", + " (\"treize\", \"13\"),\n", + " (\"quatorze\", \"14\"),\n", + " (\"quinze\", \"15\"),\n", + " (\"seize\", \"16\"),\n", + " ]\n", + ")" ] }, { @@ -706,7 +719,9 @@ "outputs": [], "source": [ "tens = pynini.string_map([(\"dix\", \"1\")])\n", - "delete_hyphen = pynini.closure(pynutil.delete(\"-\"), 0, 1) # Applies a closure from 0-1 of operation. Equivalent to regex /?/\n", + "delete_hyphen = pynini.closure(\n", + " pynutil.delete(\"-\"), 0, 1\n", + ") # Applies a closure from 0-1 of operation. Equivalent to regex /?/\n", "\n", "graph_tens = tens + delete_hyphen + digits" ] @@ -820,7 +835,7 @@ }, "outputs": [], "source": [ - "graph_digits = digits | pynutil.insert(\"0\") # inserts zero if no digit follows" + "graph_digits = digits | pynutil.insert(\"0\") # inserts zero if no digit follows" ] }, { @@ -862,7 +877,7 @@ "outputs": [], "source": [ "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " + "graph_all = graph_teens_and_tens | zero" ] }, { @@ -886,13 +901,13 @@ }, "outputs": [], "source": [ - "apply_fst(\"dix-huit\", graph_all) \n", - "apply_fst(\"seize\" , graph_all)\n", - "apply_fst(\"dix\" , graph_all) \n", - "apply_fst(\"une\" , graph_all) \n", - "apply_fst(\"trois\" , graph_all) \n", - "apply_fst(\"quatre\" , graph_all) \n", - "apply_fst(\"zéro\" , graph_all)" + "apply_fst(\"dix-huit\", graph_all)\n", + "apply_fst(\"seize\", graph_all)\n", + "apply_fst(\"dix\", graph_all)\n", + "apply_fst(\"une\", graph_all)\n", + "apply_fst(\"trois\", graph_all)\n", + "apply_fst(\"quatre\", graph_all)\n", + "apply_fst(\"zéro\", graph_all)" ] }, { @@ -959,14 +974,16 @@ }, "outputs": 
[], "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - "\t\t])\n", + "tens = pynini.string_map(\n", + " [\n", + " (\"dix\", \"1\"),\n", + " (\"vingt\", \"2\"),\n", + " (\"trente\", \"3\"),\n", + " (\"quarante\", \"4\"),\n", + " (\"cinquante\", \"5\"),\n", + " (\"soixante\", \"6\"),\n", + " ]\n", + ")\n", "\n", "graph_et = pynutil.delete(\"-et-\")\n", "\n", @@ -974,7 +991,7 @@ "\n", "graph_tens = tens + graph_digits\n", "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " + "graph_all = graph_teens_and_tens | zero" ] }, { @@ -1045,19 +1062,21 @@ }, "outputs": [], "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - " (\"quatre-vingt\", \"8\")\n", - "\t\t])\n", + "tens = pynini.string_map(\n", + " [\n", + " (\"dix\", \"1\"),\n", + " (\"vingt\", \"2\"),\n", + " (\"trente\", \"3\"),\n", + " (\"quarante\", \"4\"),\n", + " (\"cinquante\", \"5\"),\n", + " (\"soixante\", \"6\"),\n", + " (\"quatre-vingt\", \"8\"),\n", + " ]\n", + ")\n", "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", "graph_tens = tens + graph_digits\n", "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " + "graph_all = graph_teens_and_tens | zero" ] }, { @@ -1121,9 +1140,7 @@ }, "outputs": [], "source": [ - "quatre_vingt_plural = pynini.string_map([\n", - " (\"quatre-vingts\", \"80\")\n", - "\t\t])" + "quatre_vingt_plural = pynini.string_map([(\"quatre-vingts\", \"80\")])" ] }, { @@ -1143,21 +1160,21 @@ }, "outputs": [], "source": [ - "quatre_vingt_singular = pynini.string_map([\n", 
- " (\"quatre-vingt-\", \"8\") # Note that the hyphen can be assumed now\n", - "\t\t])\n", - "graph_digits_without_zero = pynini.string_map([\n", - "\t\t\t\t(\"un\",\"1\"),\n", - "\t\t\t\t(\"une\",\"1\"),\n", - "\t\t\t\t(\"deux\",\"2\"),\n", - "\t\t\t\t(\"trois\",\"3\"),\n", - "\t\t\t\t(\"quatre\",\"4\"),\n", - "\t\t\t\t(\"cinq\",\"5\"),\n", - "\t\t\t\t(\"six\",\"6\"),\n", - "\t\t\t\t(\"sept\",\"7\"),\n", - "\t\t\t\t(\"huit\",\"8\"),\n", - "\t\t\t\t(\"neuf\",\"9\")\n", - "])\n", + "quatre_vingt_singular = pynini.string_map([(\"quatre-vingt-\", \"8\")]) # Note that the hyphen can be assumed now\n", + "graph_digits_without_zero = pynini.string_map(\n", + " [\n", + " (\"un\", \"1\"),\n", + " (\"une\", \"1\"),\n", + " (\"deux\", \"2\"),\n", + " (\"trois\", \"3\"),\n", + " (\"quatre\", \"4\"),\n", + " (\"cinq\", \"5\"),\n", + " (\"six\", \"6\"),\n", + " (\"sept\", \"7\"),\n", + " (\"huit\", \"8\"),\n", + " (\"neuf\", \"9\"),\n", + " ]\n", + ")\n", "graph_eighties = (quatre_vingt_singular + graph_digits_without_zero) | quatre_vingt_plural" ] }, @@ -1178,33 +1195,43 @@ }, "outputs": [], "source": [ - "seventy_and_ninety = pynini.string_map([\n", + "seventy_and_ninety = pynini.string_map(\n", + " [\n", " (\"soixante-dix\", \"70\"),\n", " (\"quatre-vingt-dix\", \"90\"),\n", - "\t\t])\n", + " ]\n", + ")\n", "\n", - "seventy_and_ninety_tens = pynini.string_map([\n", + "seventy_and_ninety_tens = pynini.string_map(\n", + " [\n", " (\"soixante-\", \"7\"),\n", " (\"quatre-vingt-\", \"9\"),\n", - "\t\t])\n", + " ]\n", + ")\n", "\n", - "seventy_and_ninety_one = pynini.string_map([\n", + "seventy_and_ninety_one = pynini.string_map(\n", + " [\n", " (\"soixante-et-onze\", \"71\"),\n", " (\"quatre-vingt-onze\", \"91\"),\n", - "\t\t])\n", + " ]\n", + ")\n", "\n", - "seventy_and_ninety_digits = digits = pynini.string_map([ \n", - "\t\t\t\t(\"douze\",\"2\"),\n", - "\t\t\t\t(\"treize\",\"3\"),\n", - "\t\t\t\t(\"quatorze\",\"4\"),\n", - "\t\t\t\t(\"quinze\",\"5\"),\n", - 
"\t\t\t\t(\"seize\",\"6\"),\n", - "\t\t\t\t(\"dix-sept\",\"7\"), # For 97-99, digits are used as normal.\n", - "\t\t\t\t(\"dix-huit\",\"8\"),\n", - "\t\t\t\t(\"dix-neuf\",\"9\")\n", - "])\n", + "seventy_and_ninety_digits = digits = pynini.string_map(\n", + " [\n", + " (\"douze\", \"2\"),\n", + " (\"treize\", \"3\"),\n", + " (\"quatorze\", \"4\"),\n", + " (\"quinze\", \"5\"),\n", + " (\"seize\", \"6\"),\n", + " (\"dix-sept\", \"7\"), # For 97-99, digits are used as normal.\n", + " (\"dix-huit\", \"8\"),\n", + " (\"dix-neuf\", \"9\"),\n", + " ]\n", + ")\n", "\n", - "graph_seventies_and_nineties = (seventy_and_ninety_tens + seventy_and_ninety_digits) | seventy_and_ninety | seventy_and_ninety_one " + "graph_seventies_and_nineties = (\n", + " (seventy_and_ninety_tens + seventy_and_ninety_digits) | seventy_and_ninety | seventy_and_ninety_one\n", + ")" ] }, { @@ -1224,20 +1251,22 @@ }, "outputs": [], "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - "\t\t])\n", + "tens = pynini.string_map(\n", + " [\n", + " (\"dix\", \"1\"),\n", + " (\"vingt\", \"2\"),\n", + " (\"trente\", \"3\"),\n", + " (\"quarante\", \"4\"),\n", + " (\"cinquante\", \"5\"),\n", + " (\"soixante\", \"6\"),\n", + " ]\n", + ")\n", "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", "\n", "graph_tens = tens + graph_digits\n", "graph_tens_with_special_cases = graph_tens | graph_seventies_and_nineties | graph_eighties\n", "graph_teens_and_tens = graph_tens_with_special_cases | teens\n", - "graph_all = graph_teens_and_tens | zero " + "graph_all = graph_teens_and_tens | zero" ] }, { @@ -1261,7 +1290,7 @@ }, "outputs": [], "source": [ - "apply_fst(\"quatre-vingt-treize\" , graph_all)\n", + "apply_fst(\"quatre-vingt-treize\", graph_all)\n", "apply_fst(\"quatre-vingts\", 
graph_all)\n", "apply_fst(\"quatre-vingt-deux\", graph_all)" ] @@ -1314,9 +1343,9 @@ "outputs": [], "source": [ "apply_fst(\"dix-une\", graph_all) # supposed to be \"onze\"\n", - "apply_fst(\"dix-deux\", graph_all) # supposed to be \"douze\"\n", + "apply_fst(\"dix-deux\", graph_all) # supposed to be \"douze\"\n", "apply_fst(\"vingt-un\", graph_all) # supposed to be \"vingt-et-un\"\n", - "apply_fst(\"trente-un\", graph_all) # supposed to be \"trente-et-un\"" + "apply_fst(\"trente-un\", graph_all) # supposed to be \"trente-et-un\"" ] }, { @@ -1340,38 +1369,41 @@ }, "outputs": [], "source": [ - "graph_tens_special = pynini.string_map([\n", - "\t\t\t\t(\"soixante-dix\", \"70\"),\n", - "\t\t\t\t(\"soixante-et-onze\",\"71\"),\n", - " (\"soixante-douze\",\"72\"),\n", - "\t\t\t\t(\"soixante-treize\",\"73\"),\n", - "\t\t\t\t(\"soizante-quatorze\",\"74\"),\n", - "\t\t\t\t(\"soixante-quinze\",\"75\"),\n", - "\t\t\t\t(\"soixante-seize\",\"76\"),\n", - " (\"soixante-dix-sept\",\"77\"),\n", - " (\"soixante-dix-huit\",\"78\"),\n", - "\t\t\t\t(\"soixante-dix-neuf\",\"79\"),\n", + "graph_tens_special = pynini.string_map(\n", + " [\n", + " (\"soixante-dix\", \"70\"),\n", + " (\"soixante-et-onze\", \"71\"),\n", + " (\"soixante-douze\", \"72\"),\n", + " (\"soixante-treize\", \"73\"),\n", + " (\"soizante-quatorze\", \"74\"),\n", + " (\"soixante-quinze\", \"75\"),\n", + " (\"soixante-seize\", \"76\"),\n", + " (\"soixante-dix-sept\", \"77\"),\n", + " (\"soixante-dix-huit\", \"78\"),\n", + " (\"soixante-dix-neuf\", \"79\"),\n", " (\"quatre-vingts\", \"80\"),\n", " (\"quatre-vingt-un\", \"81\"),\n", " (\"quatre-vingt-une\", \"81\"),\n", - "\t\t\t\t(\"quatre-vingt-deux\",\"82\"),\n", - " (\"quatre-vingt-trois\",\"83\"),\n", - " (\"quatre-vingt-quatre\",\"84\"),\n", - " (\"quatre-vingt-cinq\",\"85\"),\n", - " (\"quatre-vingt-six\",\"86\"),\n", - " (\"quatre-vingt-sept\",\"87\"),\n", - " (\"quatre-vingt-huit\",\"88\"),\n", - " (\"quatre-vingt-neuf\",\"89\"),\n", - " 
(\"quatre-vingt-dix\",\"90\"),\n", - " (\"quatre-vingt-onze\",\"91\"),\n", - " (\"quatre-vingt-douze\",\"92\"),\n", - " (\"quatre-vingt-treize\",\"93\"),\n", - " (\"quatre-vingt-quatorze\",\"94\"),\n", - " (\"quatre-vingt-quinze\",\"95\"),\n", - " (\"quatre-vingt-sieze\",\"96\"),\n", - " (\"quatre-vingt-dix-sept\",\"97\"),\n", - " (\"quatre-vingt-dix-huit\",\"98\"),\n", - " (\"quatre-vingt-dix-neuf\",\"99\"),])" + " (\"quatre-vingt-deux\", \"82\"),\n", + " (\"quatre-vingt-trois\", \"83\"),\n", + " (\"quatre-vingt-quatre\", \"84\"),\n", + " (\"quatre-vingt-cinq\", \"85\"),\n", + " (\"quatre-vingt-six\", \"86\"),\n", + " (\"quatre-vingt-sept\", \"87\"),\n", + " (\"quatre-vingt-huit\", \"88\"),\n", + " (\"quatre-vingt-neuf\", \"89\"),\n", + " (\"quatre-vingt-dix\", \"90\"),\n", + " (\"quatre-vingt-onze\", \"91\"),\n", + " (\"quatre-vingt-douze\", \"92\"),\n", + " (\"quatre-vingt-treize\", \"93\"),\n", + " (\"quatre-vingt-quatorze\", \"94\"),\n", + " (\"quatre-vingt-quinze\", \"95\"),\n", + " (\"quatre-vingt-sieze\", \"96\"),\n", + " (\"quatre-vingt-dix-sept\", \"97\"),\n", + " (\"quatre-vingt-dix-huit\", \"98\"),\n", + " (\"quatre-vingt-dix-neuf\", \"99\"),\n", + " ]\n", + ")" ] }, { @@ -1395,7 +1427,7 @@ }, "outputs": [], "source": [ - "constructed_version = (graph_seventies_and_nineties | graph_eighties)\n", + "constructed_version = graph_seventies_and_nineties | graph_eighties\n", "constructed_version.num_states()" ] }, @@ -1535,7 +1567,7 @@ "source": [ "apply_fst(\"deux-cent-trois\", hundreds)\n", "apply_fst(\"huit-cent-quatre-vingts\", hundreds)\n", - "apply_fst(\"cinq-cent-trente\" , hundreds) " + "apply_fst(\"cinq-cent-trente\", hundreds)" ] }, { @@ -1559,7 +1591,9 @@ }, "outputs": [], "source": [ - "cents = pynini.accep(\"cent\") | pynini.accep(\"cents\") # Creates a Finite State (Accep)tor, mapping inputs back to themselves\n", + "cents = pynini.accep(\"cent\") | pynini.accep(\n", + " \"cents\"\n", + ") # Creates a Finite State (Accep)tor, mapping inputs back 
to themselves\n", "hundreds = graph_digits + delete_hyphen + pynutil.delete(cents) + delete_hyphen + graph_all" ] }, @@ -1580,7 +1614,7 @@ }, "outputs": [], "source": [ - "graph_cents = pynini.cross(\"cents\", \"00\") # Creates a single input-output mapping\n", + "graph_cents = pynini.cross(\"cents\", \"00\") # Creates a single input-output mapping\n", "hundreds = graph_digits + delete_hyphen + ((pynutil.delete(\"cent\") + delete_hyphen + graph_all) | graph_cents)" ] }, @@ -1618,9 +1652,9 @@ }, "outputs": [], "source": [ - "apply_fst(\"trois-cents\", graph_hundreds) \n", + "apply_fst(\"trois-cents\", graph_hundreds)\n", "apply_fst(\"cent\", graph_hundreds)\n", - "apply_fst(\"cent-trois\", graph_hundreds) " + "apply_fst(\"cent-trois\", graph_hundreds)" ] }, { @@ -1738,8 +1772,8 @@ }, "outputs": [], "source": [ - "apply_fst(\"mille-cent-un\", graph_thousands) # Should be 1101\n", - "apply_fst(\"mille-cent\", graph_thousands) # 1100" + "apply_fst(\"mille-cent-un\", graph_thousands) # Should be 1101\n", + "apply_fst(\"mille-cent\", graph_thousands) # 1100" ] }, { @@ -1761,8 +1795,8 @@ "source": [ "graph_cents = pynini.cross(\"cents\", \"00\")\n", "graph_cent = pynini.cross(\"cent\", \"1\")\n", - "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", - "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\")) + graph_all \n", + "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", + "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\")) + graph_all\n", "\n", "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", @@ -1788,7 +1822,7 @@ }, "outputs": [], "source": [ - "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\", weight=.1)) + graph_all \n", + "graph_hundreds = (graph_hundreds_first_digit + 
delete_hyphen | pynutil.insert(\"0\", weight=0.1)) + graph_all\n", "\n", "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", @@ -1850,7 +1884,9 @@ "source": [ "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", - "graph_thousands = (graph_one_thousand | graph_many_thousand | pynutil.insert(\"000\", weight=.001)) + delete_hyphen + graph_hundreds" + "graph_thousands = (\n", + " (graph_one_thousand | graph_many_thousand | pynutil.insert(\"000\", weight=0.001)) + delete_hyphen + graph_hundreds\n", + ")" ] }, { @@ -1887,8 +1923,10 @@ "outputs": [], "source": [ "millions = pynini.accep(\"million\") | pynini.accep(\"millions\")\n", - "graph_millions = ((graph_hundreds + delete_hyphen + pynutil.delete(millions) + delete_hyphen) | pynutil.insert(\"000\", weight=.1) # We need three zeroes now\n", - " ) + graph_thousands" + "graph_millions = (\n", + " (graph_hundreds + delete_hyphen + pynutil.delete(millions) + delete_hyphen)\n", + " | pynutil.insert(\"000\", weight=0.1) # We need three zeroes now\n", + ") + graph_thousands" ] }, { @@ -1900,8 +1938,10 @@ "outputs": [], "source": [ "billions = pynini.accep(\"milliards\") | pynini.accep(\"milliard\")\n", - "graph_billions = ((graph_hundreds + delete_hyphen + pynutil.delete(billions) + delete_hyphen)| pynutil.insert(\"000\",weight=.1) # We need three zeroes now\n", - " ) + graph_millions" + "graph_billions = (\n", + " (graph_hundreds + delete_hyphen + pynutil.delete(billions) + delete_hyphen)\n", + " | pynutil.insert(\"000\", weight=0.1) # We need three zeroes now\n", + ") + graph_millions" ] }, { @@ -1913,8 +1953,10 @@ "outputs": [], "source": [ "trillions = pynini.accep(\"billion\") | pynini.accep(\"billions\")\n", - "graph_trillions = ((graph_hundreds + delete_hyphen + pynutil.delete(trillions) + delete_hyphen) | pynutil.insert(\"000\",weight=.1) 
# We need three zeroes now\n", - " ) + graph_billions" + "graph_trillions = (\n", + " (graph_hundreds + delete_hyphen + pynutil.delete(trillions) + delete_hyphen)\n", + " | pynutil.insert(\"000\", weight=0.1) # We need three zeroes now\n", + ") + graph_billions" ] }, { @@ -1959,7 +2001,7 @@ "outputs": [], "source": [ "example = \"deux-cent-milliard-quatre-million-deux-cent-quatre-vingt-onze\"\n", - "apply_fst(example, graph) " + "apply_fst(example, graph)" ] }, { @@ -1993,14 +2035,18 @@ }, "outputs": [], "source": [ - "delete_leading_zeroes = pynutil.delete(pynini.closure(\"0\")) # will delete all zeroes under closure. Equivalent to regex * operator\n", - "stop_at_non_zero = pynini.difference(NEMO_DIGIT, \"0\") # creates a graph that accepts all input-outputs from NEMO_DIGIT except 0\n", - "rest_of_cardinal = pynini.closure(NEMO_DIGIT) # accepts all digits that may follow\n", + "delete_leading_zeroes = pynutil.delete(\n", + " pynini.closure(\"0\")\n", + ") # will delete all zeroes under closure. Equivalent to regex * operator\n", + "stop_at_non_zero = pynini.difference(\n", + " NEMO_DIGIT, \"0\"\n", + ") # creates a graph that accepts all input-outputs from NEMO_DIGIT except 0\n", + "rest_of_cardinal = pynini.closure(NEMO_DIGIT) # accepts all digits that may follow\n", "\n", "clean_cardinal = delete_leading_zeroes + stop_at_non_zero + rest_of_cardinal\n", - "clean_cardinal = clean_cardinal | \"0\" # We don't want to ignore the occurrence of zero\n", + "clean_cardinal = clean_cardinal | \"0\" # We don't want to ignore the occurrence of zero\n", "\n", - "graph = graph @ clean_cardinal " + "graph = graph @ clean_cardinal" ] }, { @@ -2103,8 +2149,8 @@ " def __init__(self):\n", " super().__init__(name=\"cardinal\", kind=\"classify\")\n", " # Rest of the grammar here\n", - " # ....... \n", - " #........." + " # .......\n", + " # ........." 
] }, { @@ -2133,8 +2179,8 @@ " def __init__(self):\n", " super().__init__(name=\"cardinal\", kind=\"classify\")\n", " # Rest of the grammar here\n", - " # ....... \n", - " #.........\n", + " # .......\n", + " # .........\n", " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" ] }, @@ -2159,8 +2205,8 @@ " def __init__(self):\n", " super().__init__(name=\"cardinal\", kind=\"classify\")\n", " # Rest of the grammar here\n", - " # ....... \n", - " #.........\n", + " # .......\n", + " # .........\n", " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", " final_graph = self.add_tokens(graph)" ] @@ -2187,7 +2233,9 @@ "outputs": [], "source": [ "optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1 # Note the extra space to separate the value from the integer field\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \",\n", + " 0,\n", + " 1, # Note the extra space to separate the value from the integer field\n", ")\n", "\n", "final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" @@ -2214,10 +2262,10 @@ "class CardinalFst(GraphFst):\n", " def __init__(self):\n", " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " \n", + "\n", " ### Cardinal Grammar....\n", " ### .....\n", - " graph = graph_trillions | zero \n", + " graph = graph_trillions | zero\n", "\n", " ### Formatting grammar....\n", " ### .....\n", @@ -2225,12 +2273,12 @@ "\n", " ### Token insertion\n", " optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", - " )\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", + " )\n", "\n", " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", "\n", - " 
final_graph = self.add_tokens(final_graph) # inserts the cardinal tag\n", + " final_graph = self.add_tokens(final_graph) # inserts the cardinal tag\n", "\n", " self.fst = final_graph" ] @@ -2305,8 +2353,8 @@ "outputs": [], "source": [ "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"verbalize\")" + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"verbalize\")" ] }, { @@ -2345,11 +2393,11 @@ "outputs": [], "source": [ "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"verbalize\")\n", - " \n", - " # Removes the negative attribute and leaves the sign if occurs\n", - " optional_sign = pynini.closure(\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"verbalize\")\n", + "\n", + " # Removes the negative attribute and leaves the sign if occurs\n", + " optional_sign = pynini.closure(\n", " pynutil.delete(\"negative:\")\n", " + delete_space\n", " + pynutil.delete(\"\\\"\")\n", @@ -2359,21 +2407,21 @@ " 0,\n", " 1,\n", " )\n", - " \n", - " # removes integer aspect\n", - " graph = (\n", + "\n", + " # removes integer aspect\n", + " graph = (\n", " pynutil.delete(\"integer:\")\n", " + delete_space\n", " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit\n", + " + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit\n", " + pynutil.delete(\"\\\"\")\n", " )\n", - " \n", - " graph = optional_sign + graph # concatenates two properties\n", "\n", - " delete_tokens = self.delete_tokens(graph) # removes semiotic class tag\n", + " graph = optional_sign + graph # concatenates two properties\n", "\n", - " self.fst = delete_tokens.optimize()" + " delete_tokens = self.delete_tokens(graph) # removes semiotic class tag\n", + "\n", + " self.fst = delete_tokens.optimize()" ] }, { @@ -2519,8 +2567,8 @@ }, "outputs": [], "source": [ - "strip_morpheme = pynutil.delete(\"ième\") # 
deletes suffix\n", - "graph_strip_morpheme = NEMO_SIGMA + strip_morpheme # accepts all strings until passed suffix, then deletes suffix" + "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "graph_strip_morpheme = NEMO_SIGMA + strip_morpheme # accepts all strings until passed suffix, then deletes suffix" ] }, { @@ -2543,22 +2591,22 @@ "class CardinalFst(GraphFst):\n", " def __init__(self):\n", " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " \n", + "\n", " ### Cardinal Grammar....\n", " ### .....\n", - " graph = graph_trillions | zero \n", + " graph = graph_trillions | zero\n", "\n", " ### Formatting grammar....\n", " ### .....\n", " graph = graph @ clean_cardinal\n", - " \n", + "\n", " ### NEW GRAPH\n", - " self.just_cardinals = graph # will produce cardinals without formatting\n", + " self.just_cardinals = graph # will produce cardinals without formatting\n", "\n", " ### Token insertion\n", " optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", - " )\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", + " )\n", "\n", " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", "\n", @@ -2582,7 +2630,7 @@ }, "outputs": [], "source": [ - "graph_cardinal = CardinalFst().just_cardinals \n", + "graph_cardinal = CardinalFst().just_cardinals\n", "graph_ordinal_regular_suffix = graph_strip_morpheme @ graph_cardinal" ] }, @@ -2603,7 +2651,7 @@ }, "outputs": [], "source": [ - "example = \"sixième\" # dervied from six/6\n", + "example = \"sixième\" # dervied from six/6\n", "apply_fst(example, graph_ordinal_regular_suffix)" ] }, @@ -2632,21 +2680,24 @@ }, "outputs": [], "source": [ - "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " 
(\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - "])" + "graph_root_change = pynini.string_map(\n", + " [\n", + " (\"quatrième\", \"quatre\"),\n", + " (\"cinquième\", \"cinq\"),\n", + " (\"neuvième\", \"neuf\"),\n", + " (\"onzième\", \"onze\"),\n", + " (\"douzième\", \"douze\"),\n", + " (\"treizième\", \"treize\"),\n", + " (\"quatorzième\", \"quatorze\"),\n", + " (\"quinzième\", \"quinze\"),\n", + " (\"seizième\", \"seize\"),\n", + " (\"trentième\", \"trente\"),\n", + " (\"quarantième\", \"quarante\"),\n", + " (\"cinquantième\", \"cinquante\"),\n", + " (\"soixantième\", \"soixante\"),\n", + " (\"millième\", \"mille\"),\n", + " ]\n", + ")" ] }, { @@ -2675,8 +2726,8 @@ }, "outputs": [], "source": [ - "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])" + "graph_firsts = pynini.string_map([(\"premier\", \"un\"), (\"première\", \"un\")])\n", + "graph_seconds = pynini.string_map([(\"second\", \"deux\"), (\"seconde\", \"deux\")])" ] }, { @@ -2707,31 +2758,34 @@ }, "outputs": [], "source": [ - "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " 
(\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - "])\n", + "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + "graph_root_change = pynini.string_map(\n", + " [\n", + " (\"quatrième\", \"quatre\"),\n", + " (\"cinquième\", \"cinq\"),\n", + " (\"neuvième\", \"neuf\"),\n", + " (\"onzième\", \"onze\"),\n", + " (\"douzième\", \"douze\"),\n", + " (\"treizième\", \"treize\"),\n", + " (\"quatorzième\", \"quatorze\"),\n", + " (\"quinzième\", \"quinze\"),\n", + " (\"seizième\", \"seize\"),\n", + " (\"trentième\", \"trente\"),\n", + " (\"quarantième\", \"quarante\"),\n", + " (\"cinquantième\", \"cinquante\"),\n", + " (\"soixantième\", \"soixante\"),\n", + " (\"millième\", \"mille\"),\n", + " ]\n", + ")\n", "\n", "# Component will accept all tokens that end with desired strings\n", - "graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + "graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change)\n", "\n", - "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", + "graph_firsts = pynini.string_map([(\"premier\", \"un\"), (\"première\", \"un\")])\n", + "graph_seconds = pynini.string_map([(\"second\", \"deux\"), (\"seconde\", \"deux\")])\n", "\n", - "graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", + "graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal)\n", "\n", "graph_cardinal = CardinalFst().just_cardinals\n", "\n", @@ -2794,7 +2848,7 @@ "outputs": [], "source": [ "def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")" + " super().__init__(name=\"ordinal\", kind=\"classify\")" ] }, { @@ -2816,10 +2870,11 @@ "source": [ "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", 
"\n", + "\n", "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # NeMo equivalent to self.just_cardinals" + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # NeMo equivalent to self.just_cardinals" ] }, { @@ -2840,37 +2895,40 @@ "outputs": [], "source": [ "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map(\n", + " [\n", + " (\"quatrième\", \"quatre\"),\n", + " (\"cinquième\", \"cinq\"),\n", + " (\"neuvième\", \"neuf\"),\n", + " (\"onzième\", \"onze\"),\n", + " (\"douzième\", \"douze\"),\n", + " (\"treizième\", \"treize\"),\n", + " (\"quatorzième\", \"quatorze\"),\n", + " (\"quinzième\", \"quinze\"),\n", + " (\"seizième\", \"seize\"),\n", + " (\"trentième\", \"trente\"),\n", + " (\"quarantième\", \"quarante\"),\n", + " (\"cinquantième\", \"cinquante\"),\n", + " (\"soixantième\", \"soixante\"),\n", + " (\"millième\", \"mille\"),\n", + " ]\n", + " )\n", "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " 
(\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change)\n", "\n", - " graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - " graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", + " graph_firsts = pynini.string_map([(\"premier\", \"un\"), (\"première\", \"un\")])\n", + " graph_seconds = pynini.string_map([(\"second\", \"deux\"), (\"seconde\", \"deux\")])\n", "\n", - " graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", + " graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal)\n", "\n", - " graph_ordinal = graph_get_cardinal @ graph_cardinal\n" + " graph_ordinal = graph_get_cardinal @ graph_cardinal" ] }, { @@ -2906,14 +2964,12 @@ }, "outputs": [], "source": [ - "graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - "graph_plural = pynini.closure(pynini.accep(\"s\"), 0, 1) # We create an acceptor since we must process the possible \"s\"\n", + "graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + "graph_plural = pynini.closure(pynini.accep(\"s\"), 0, 1) # We create an acceptor since we must process the possible \"s\"\n", "\n", "graph_morpheme_component = graph_morpheme + graph_plural\n", "\n", - "graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", - " )" + "graph_morphosyntactic_features = pynutil.insert(\" morphosyntactic_features: \\\"\") + 
graph_morpheme_component" ] }, { @@ -2933,7 +2989,7 @@ }, "outputs": [], "source": [ - "graph_reg_ordinals = graph_get_cardinal @ graph_cardinal # Rewriting ordinals to remove the first and second ordinal.\n", + "graph_reg_ordinals = graph_get_cardinal @ graph_cardinal # Rewriting ordinals to remove the first and second ordinal.\n", "\n", "graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", "graph_ordinal += graph_morphosyntactic_features" @@ -2956,10 +3012,10 @@ }, "outputs": [], "source": [ - "firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - "firsts += graph_plural # Still accepts plural marker in superscript\n", - "seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - "seconds += graph_plural \n", + "firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\", \"re\")])\n", + "firsts += graph_plural # Still accepts plural marker in superscript\n", + "seconds = pynini.string_map([(\"second\", \"d\"), (\"seconde\", \"de\")])\n", + "seconds += graph_plural\n", "\n", "graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", "graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds" @@ -2983,60 +3039,61 @@ "outputs": [], "source": [ "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " 
(\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", - "\n", - " # Graph will map ordinals beyond second ordinal to their cardinals\n", - " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", - "\n", - " # Graphing morphosyntactic_features\n", - " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - " graph_plural = pynini.accep(\"s\").ques # ques is equivalent to pynini.closure(, 0, 1)\n", - "\n", - " graph_morpheme_component = graph_morpheme + graph_plural\n", - "\n", - " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map(\n", + " [\n", + " (\"quatrième\", \"quatre\"),\n", + " (\"cinquième\", \"cinq\"),\n", + " (\"neuvième\", \"neuf\"),\n", + " (\"onzième\", \"onze\"),\n", + " (\"douzième\", \"douze\"),\n", + " (\"treizième\", \"treize\"),\n", + " (\"quatorzième\", \"quatorze\"),\n", + " (\"quinzième\", \"quinze\"),\n", + " (\"seizième\", \"seize\"),\n", + " (\"trentième\", \"trente\"),\n", + " (\"quarantième\", \"quarante\"),\n", + " (\"cinquantième\", \"cinquante\"),\n", + " (\"soixantième\", \"soixante\"),\n", + " (\"millième\", \"mille\"),\n", + " ]\n", " )\n", "\n", - " # Adding in the `integer` property:\n", - " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", - " 
graph_ordinal += graph_morphosyntactic_features \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change)\n", + "\n", + " # Graph will map ordinals beyond second ordinal to their cardinals\n", + " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", + "\n", + " # Graphing morphosyntactic_features\n", + " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + " graph_plural = pynini.accep(\"s\").ques # ques is equivalent to pynini.closure(, 0, 1)\n", "\n", - " # Case of first and second ordinals\n", - " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - " firsts += graph_plural # Still accepts plural marker in superscript\n", - " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - " seconds += graph_plural \n", + " graph_morpheme_component = graph_morpheme + graph_plural\n", "\n", - " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", - " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", + " graph_morphosyntactic_features = pynutil.insert(\" morphosyntactic_features: \\\"\") + graph_morpheme_component\n", "\n", - " # All together\n", - " graph_ordinal = pynini.union(graph_ordinal, graph_firsts, graph_seconds)\n", - " self.fst = graph_ordinal.optimize()" + " # Adding in the `integer` property:\n", + " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", + " graph_ordinal += graph_morphosyntactic_features\n", + "\n", + " # Case of first and second ordinals\n", + " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\", \"re\")])\n", + " firsts += graph_plural # Still accepts plural marker in superscript\n", + " seconds = pynini.string_map([(\"second\", \"d\"), (\"seconde\", \"de\")])\n", + " seconds += graph_plural\n", + "\n", + " graph_firsts = 
pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", + " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", + "\n", + " # All together\n", + " graph_ordinal = pynini.union(graph_ordinal, graph_firsts, graph_seconds)\n", + " self.fst = graph_ordinal.optimize()" ] }, { @@ -3102,7 +3159,9 @@ "source": [ "special_tokens = pynini.accep(\"siècle\")\n", "\n", - "graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space in between this token and the following one.\n", + "graph_special_tokens = (\n", + " delete_space + pynutil.insert(\"/\") + special_tokens\n", + ") # We need to delete the space in between this token and the following one.\n", "graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", "\n", "graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")" @@ -3128,69 +3187,71 @@ "outputs": [], "source": [ "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) 
\n", - "\n", - " # Graph will map ordinals beyond second ordinal to their cardinals\n", - " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", - "\n", - " # Graphing morphosyntactic_features\n", - " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - " graph_plural = pynini.accep(\"s\").ques # We create an acceptor since we must process the possible \"s\"\n", - "\n", - " graph_morpheme_component = graph_morpheme + graph_plural\n", - "\n", - " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map(\n", + " [\n", + " (\"quatrième\", \"quatre\"),\n", + " (\"cinquième\", \"cinq\"),\n", + " (\"neuvième\", \"neuf\"),\n", + " (\"onzième\", \"onze\"),\n", + " (\"douzième\", \"douze\"),\n", + " (\"treizième\", \"treize\"),\n", + " (\"quatorzième\", \"quatorze\"),\n", + " (\"quinzième\", \"quinze\"),\n", + " (\"seizième\", \"seize\"),\n", + " (\"trentième\", \"trente\"),\n", + " (\"quarantième\", \"quarante\"),\n", + " (\"cinquantième\", \"cinquante\"),\n", + " (\"soixantième\", \"soixante\"),\n", + " (\"millième\", \"mille\"),\n", + " ]\n", " )\n", "\n", - " # Adding in the `integer` property:\n", - " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", - " graph_ordinal += graph_morphosyntactic_features \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change)\n", + "\n", + " # Graph will map ordinals beyond second ordinal to their cardinals\n", + " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", + "\n", + " # Graphing 
morphosyntactic_features\n", + " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + " graph_plural = pynini.accep(\"s\").ques # We create an acceptor since we must process the possible \"s\"\n", "\n", - " # Case of first and second ordinals\n", - " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - " firsts += graph_plural # Still accepts plural marker in superscript\n", - " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - " seconds += graph_plural \n", + " graph_morpheme_component = graph_morpheme + graph_plural\n", "\n", - " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", - " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", + " graph_morphosyntactic_features = pynutil.insert(\" morphosyntactic_features: \\\"\") + graph_morpheme_component\n", "\n", + " # Adding in the `integer` property:\n", + " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", + " graph_ordinal += graph_morphosyntactic_features\n", "\n", - " # Special tokens\n", - " special_tokens = pynini.accep(\"siècle\")\n", + " # Case of first and second ordinals\n", + " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\", \"re\")])\n", + " firsts += graph_plural # Still accepts plural marker in superscript\n", + " seconds = pynini.string_map([(\"second\", \"d\"), (\"seconde\", \"de\")])\n", + " seconds += graph_plural\n", "\n", - " graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space in between this token and the following one.\n", - " graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", + " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", + " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", "\n", - " 
graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")\n", + " # Special tokens\n", + " special_tokens = pynini.accep(\"siècle\")\n", "\n", - " # Finishing\n", - " graph_ordinal = self.add_tokens(graph_ordinal)\n", - " self.fst = graph_ordinal.optimize()\n" + " graph_special_tokens = (\n", + " delete_space + pynutil.insert(\"/\") + special_tokens\n", + " ) # We need to delete the space in between this token and the following one.\n", + " graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", + "\n", + " graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")\n", + "\n", + " # Finishing\n", + " graph_ordinal = self.add_tokens(graph_ordinal)\n", + " self.fst = graph_ordinal.optimize()" ] }, { @@ -3248,16 +3309,16 @@ }, "outputs": [], "source": [ - " # Create mappings for all superscripts\n", - " superscript = pynini.union(\n", - " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", - " pynini.cross(\"d\", \"ᵈ\"),\n", - " pynini.cross(\"r\", \"ʳ\"),\n", - " pynini.cross(\"s\", \"ˢ\"),\n", - " )\n", + "# Create mappings for all superscripts\n", + "superscript = pynini.union(\n", + " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", + " pynini.cross(\"d\", \"ᵈ\"),\n", + " pynini.cross(\"r\", \"ʳ\"),\n", + " pynini.cross(\"s\", \"ˢ\"),\n", + ")\n", "\n", - " # Append to deletion of feature property. Note that we use plus closure for multiple superscripts.\n", - " graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus" + "# Append to deletion of feature property. 
Note that we use plus closure for multiple superscripts.\n", + "graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus" ] }, { @@ -3307,12 +3368,12 @@ "outputs": [], "source": [ "graph_integer = (\n", - " pynutil.delete(\"integer:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", + " pynutil.delete(\"integer:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + ")\n", "graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")" ] }, @@ -3351,27 +3412,33 @@ }, "outputs": [], "source": [ - "digits = pynini.string_map([(\"1\", \"I\"),\n", - " (\"2\",\t\"II\"),\n", - " (\"3\",\t\"III\"),\n", - " (\"4\",\t\"IV\"),\n", - " (\"5\",\t\"V\"),\n", - " (\"6\",\t\"VI\"),\n", - " (\"7\",\t\"VII\"),\n", - " (\"8\",\t\"VIII\"),\n", - " (\"9\",\t\"IX\"),\n", - " ])\n", - "tens = pynini.string_map([(\"1\", \"X\"),\n", - " (\"2\",\t\"XX\"),\n", - " (\"3\",\t\"XXX\"),\n", - " (\"4\",\t\"XL\"),\n", - " (\"5\",\t\"L\"),\n", - " (\"6\",\t\"LX\"),\n", - " (\"7\",\t\"LXX\"),\n", - " (\"8\",\t\"LXXX\"),\n", - " (\"9\",\t\"XC\"),\n", - " ])\n", - "zero = pynutil.delete(\"0\") # No Roman representation for zero." 
+ "digits = pynini.string_map(\n", + " [\n", + " (\"1\", \"I\"),\n", + " (\"2\", \"II\"),\n", + " (\"3\", \"III\"),\n", + " (\"4\", \"IV\"),\n", + " (\"5\", \"V\"),\n", + " (\"6\", \"VI\"),\n", + " (\"7\", \"VII\"),\n", + " (\"8\", \"VIII\"),\n", + " (\"9\", \"IX\"),\n", + " ]\n", + ")\n", + "tens = pynini.string_map(\n", + " [\n", + " (\"1\", \"X\"),\n", + " (\"2\", \"XX\"),\n", + " (\"3\", \"XXX\"),\n", + " (\"4\", \"XL\"),\n", + " (\"5\", \"L\"),\n", + " (\"6\", \"LX\"),\n", + " (\"7\", \"LXX\"),\n", + " (\"8\", \"LXXX\"),\n", + " (\"9\", \"XC\"),\n", + " ]\n", + ")\n", + "zero = pynutil.delete(\"0\") # No Roman representation for zero." ] }, { @@ -3392,7 +3459,7 @@ "outputs": [], "source": [ "map_one_digit = NEMO_DIGIT\n", - "map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation." + "map_two_digits = NEMO_DIGIT**2 # pynini overloads the exponent function to allow self-concatenation." ] }, { @@ -3486,31 +3553,37 @@ " graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")\n", "\n", " # Mapping Roman numerals\n", - " digits = pynini.string_map([(\"1\", \"I\"),\n", - " (\"2\",\t\"II\"),\n", - " (\"3\",\t\"III\"),\n", - " (\"4\",\t\"IV\"),\n", - " (\"5\",\t\"V\"),\n", - " (\"6\",\t\"VI\"),\n", - " (\"7\",\t\"VII\"),\n", - " (\"8\",\t\"VIII\"),\n", - " (\"9\",\t\"IX\"),\n", - " ])\n", - " tens = pynini.string_map([(\"1\", \"X\"),\n", - " (\"2\",\t\"XX\"),\n", - " (\"3\",\t\"XXX\"),\n", - " (\"4\",\t\"XL\"),\n", - " (\"5\",\t\"L\"),\n", - " (\"6\",\t\"LX\"),\n", - " (\"7\",\t\"LXX\"),\n", - " (\"8\",\t\"LXXX\"),\n", - " (\"9\",\t\"XC\"),\n", - " ])\n", - " zero = pynutil.delete(\"0\") # No Roman representation for zero.\n", + " digits = pynini.string_map(\n", + " [\n", + " (\"1\", \"I\"),\n", + " (\"2\", \"II\"),\n", + " (\"3\", \"III\"),\n", + " (\"4\", \"IV\"),\n", + " (\"5\", \"V\"),\n", + " (\"6\", \"VI\"),\n", + " (\"7\", \"VII\"),\n", + " (\"8\", \"VIII\"),\n", + " (\"9\", 
\"IX\"),\n", + " ]\n", + " )\n", + " tens = pynini.string_map(\n", + " [\n", + " (\"1\", \"X\"),\n", + " (\"2\", \"XX\"),\n", + " (\"3\", \"XXX\"),\n", + " (\"4\", \"XL\"),\n", + " (\"5\", \"L\"),\n", + " (\"6\", \"LX\"),\n", + " (\"7\", \"LXX\"),\n", + " (\"8\", \"LXXX\"),\n", + " (\"9\", \"XC\"),\n", + " ]\n", + " )\n", + " zero = pynutil.delete(\"0\") # No Roman representation for zero.\n", "\n", " # filters for Roman digits\n", " map_one_digit = NEMO_DIGIT\n", - " map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation.\n", + " map_two_digits = NEMO_DIGIT**2 # pynini overloads the exponent function to allow self-concatenation.\n", "\n", " # Composing onto roman digits\n", " graph_one_digit_romans = NEMO_DIGIT @ digits\n", @@ -3525,10 +3598,10 @@ " graph_romans += pynini.cross(\"/\", \" \") + \"siècle\" + pynutil.delete(\"\\\"\")\n", "\n", " # Final composition\n", - " graph = (graph_romans | graph_Arabic)\n", + " graph = graph_romans | graph_Arabic\n", "\n", " delete_tokens = self.delete_tokens(graph)\n", - " self.fst = delete_tokens.optimize()\n" + " self.fst = delete_tokens.optimize()" ] }, { @@ -3643,7 +3716,7 @@ }, "outputs": [], "source": [ - "cardinal = CardinalFst().graph_no_exception # NeMo equivalent of just_cardinals\n", + "cardinal = CardinalFst().graph_no_exception # NeMo equivalent of just_cardinals\n", "\n", "# place cardinal under closure to permit values <=1\n", "graph_integer = pynini.closure(cardinal, 0, 1)" @@ -3710,8 +3783,8 @@ }, "outputs": [], "source": [ - "example = \"trois virgule trois cinquante-cinq\" \n", - "apply_fst(example, graph_decimal) # Should output only the cardinals in the string" + "example = \"trois virgule trois cinquante-cinq\"\n", + "apply_fst(example, graph_decimal) # Should output only the cardinals in the string" ] }, { @@ -3813,7 +3886,9 @@ }, "outputs": [], "source": [ - "graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.1) # 
In cases we don't always have an integer preceding\n", + "graph_integer_or_none = graph_integer | pynutil.insert(\n", + " \"integer_part: \\\"0\\\" \", weight=0.1\n", + ") # In cases we don't always have an integer preceding\n", "graph_decimal_no_sign = graph_integer_or_none + delete_space + pynutil.delete(\"virgule\") + graph_fractional" ] }, @@ -3876,13 +3951,15 @@ " delete_virgule = pynutil.delete(\"virgule\")\n", "\n", " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\n", + " \"integer_part: \\\"0\\\" \", weight=0.001\n", + " ) # In cases we don't always have an integer preceding\n", "\n", " graph_string_of_cardinals = delete_space + cardinal\n", " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", "\n", - " graph_decimal_no_sign = graph_integer_or_none + pynutil.delete(\"virgule\") + graph_fractional \n", + " graph_decimal_no_sign = graph_integer_or_none + pynutil.delete(\"virgule\") + graph_fractional\n", "\n", " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", " graph_negative = pynini.closure(graph_negative, 0, 1)\n", @@ -3890,8 +3967,7 @@ " graph_decimal = graph_negative + graph_decimal_no_sign\n", "\n", " graph = self.add_tokens(graph_decimal)\n", - " self.fst = graph.optimize()\n", - "\n" + " self.fst = graph.optimize()" ] }, { @@ -3936,19 +4012,19 @@ "outputs": [], "source": [ "suffix = pynini.union(\n", - " \"million\",\n", - " \"millions\",\n", - " \"milliard\",\n", - " \"milliards\",\n", - " \"billion\",\n", - " \"billions\",\n", - " \"billiard\",\n", - " \"billiards\",\n", - " 
\"trillion\",\n", - " \"trillions\",\n", - " \"trilliard\",\n", - " \"trilliards\",\n", - " )" + " \"million\",\n", + " \"millions\",\n", + " \"milliard\",\n", + " \"milliards\",\n", + " \"billion\",\n", + " \"billions\",\n", + " \"billiard\",\n", + " \"billiards\",\n", + " \"trillion\",\n", + " \"trillions\",\n", + " \"trilliard\",\n", + " \"trilliards\",\n", + ")" ] }, { @@ -3971,7 +4047,7 @@ "outputs": [], "source": [ "def get_quantity(decimal, cardinal_up_to_thousand):\n", - " key_values = pynini.union(\n", + " key_values = pynini.union(\n", " \"million\",\n", " \"millions\",\n", " \"milliard\",\n", @@ -3986,23 +4062,23 @@ " \"trilliards\",\n", " )\n", " # The French WFST that this borrows from has not removed leading zeroes yet.\n", - " numbers = cardinal_up_to_thousand @ (\n", - " pynutil.delete(pynini.closure(\"0\")) + pynini.difference(NEMO_DIGIT, \"0\") + pynini.closure(NEMO_DIGIT)\n", - " )\n", - " res = (\n", - " pynutil.insert(\"integer_part: \\\"\")\n", - " + numbers\n", - " + pynutil.insert(\"\\\"\")\n", - " + (\n", - " pynini.union(delete_hyphen, delete_extra_space)\n", - " ) # Can be written either as 'deux-millions' or 'deux millions' depending on whether it registers as a noun or part of cardinal.\n", - " + pynutil.insert(\" quantity: \\\"\")\n", - " + suffix\n", - " + pynutil.insert(\"\\\"\")\n", - " )\n", - " # Union with decimal to permit either a cardinal or decimal representation.\n", - " res |= decimal + delete_extra_space + pynutil.insert(\" quantity: \\\"\") + suffix + pynutil.insert(\"\\\"\")\n", - " return res" + " numbers = cardinal_up_to_thousand @ (\n", + " pynutil.delete(pynini.closure(\"0\")) + pynini.difference(NEMO_DIGIT, \"0\") + pynini.closure(NEMO_DIGIT)\n", + " )\n", + " res = (\n", + " pynutil.insert(\"integer_part: \\\"\")\n", + " + numbers\n", + " + pynutil.insert(\"\\\"\")\n", + " + (\n", + " pynini.union(delete_hyphen, delete_extra_space)\n", + " ) # Can be written either as 'deux-millions' or 'deux millions' depending on 
whether it registers as a noun or part of cardinal.\n", + " + pynutil.insert(\" quantity: \\\"\")\n", + " + suffix\n", + " + pynutil.insert(\"\\\"\")\n", + " )\n", + " # Union with decimal to permit either a cardinal or decimal representation.\n", + " res |= decimal + delete_extra_space + pynutil.insert(\" quantity: \\\"\") + suffix + pynutil.insert(\"\\\"\")\n", + " return res" ] }, { @@ -4030,22 +4106,22 @@ " delete_virgule = pynutil.delete(\"virgule\")\n", "\n", " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\n", + " \"integer_part: \\\"0\\\" \", weight=0.001\n", + " ) # In cases we don't always have an integer preceding\n", "\n", " graph_string_of_cardinals = delete_space + cardinal\n", " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", "\n", - " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", + " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional\n", "\n", " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", " graph_negative = pynini.closure(graph_negative, 0, 1)\n", " graph_decimal = graph_negative + graph_decimal_no_sign\n", "\n", " # Union default decimal with version that accepts quantities\n", - " graph_decimal |= graph_negative + get_quantity(\n", - " graph_decimal_no_sign, quantities_cardinal\n", - " )\n", + " graph_decimal |= graph_negative + get_quantity(graph_decimal_no_sign, quantities_cardinal)\n", " final_graph = self.add_tokens(graph_decimal)\n", " self.fst = final_graph.optimize()" ] @@ -4114,8 +4190,10 @@ }, 
"outputs": [], "source": [ - "every_three_digits = NEMO_DIGIT ** 3 # accepts a string of three digits\n", - "space_every_three_integer = pynini.closure(NEMO_NON_BREAKING_SPACE + every_three_digits) # inserts space before every three digits." + "every_three_digits = NEMO_DIGIT**3 # accepts a string of three digits\n", + "space_every_three_integer = pynini.closure(\n", + " NEMO_NON_BREAKING_SPACE + every_three_digits\n", + ") # inserts space before every three digits." ] }, { @@ -4186,7 +4264,7 @@ " super().__init__(name=\"decimal\", kind=\"verbalize\")\n", "\n", " # Need parser to group digits by threes\n", - " exactly_three_digits = NEMO_DIGIT ** 3\n", + " exactly_three_digits = NEMO_DIGIT**3\n", " at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)\n", "\n", " space_every_three_integer = (\n", @@ -4229,7 +4307,7 @@ " )\n", " optional_quantity = pynini.closure(pynutil.insert(\" \") + quantity + delete_space, 0, 1)\n", " graph = (optional_integer + optional_fractional + optional_quantity).optimize()\n", - " self.numbers = graph # Saving just the part of the graph used for numbers\n", + " self.numbers = graph # Saving just the part of the graph used for numbers\n", " graph = optional_sign + graph\n", " delete_tokens = self.delete_tokens(graph)\n", " self.fst = delete_tokens.optimize()" @@ -4254,7 +4332,7 @@ "example2 = 'decimal { integer_part: \"22323\" fractional_part: \"104553\" }'\n", "\n", "apply_fst(example1, fst)\n", - "apply_fst(example2, fst)\n" + "apply_fst(example2, fst)" ] }, { @@ -4356,7 +4434,7 @@ "from nemo_text_processing.inverse_text_normalization.fr.taggers import cardinal\n", "\n", "cardinal_graph = cardinal.CardinalFst()\n", - "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", + "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", "\n", "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", "graph_fractional_values = 
graph_cardinal @ add_leading_zero_to_double_digit" @@ -4383,27 +4461,27 @@ " delete_virgule = pynutil.delete(\"virgule\")\n", "\n", " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\n", + " \"integer_part: \\\"0\\\" \", weight=0.001\n", + " ) # In cases we don't always have an integer preceding\n", "\n", " graph_string_of_cardinals = delete_space + cardinal\n", " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", "\n", - " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", + " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional\n", "\n", " ### NEW GRAPH HERE\n", " self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(\n", " final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit\n", " )\n", - " \n", + "\n", " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", " graph_negative = pynini.closure(graph_negative, 0, 1)\n", " graph_decimal = graph_negative + graph_decimal_no_sign\n", "\n", " # Union default decimal with version that accepts quantities\n", - " graph_decimal |= graph_negative + get_quantity(\n", - " graph_decimal_no_sign, quantities_cardinal\n", - " )\n", + " graph_decimal |= graph_negative + get_quantity(graph_decimal_no_sign, quantities_cardinal)\n", " final_graph = self.add_tokens(graph_decimal)\n", " self.fst = final_graph.optimize()" ] @@ -4426,8 +4504,8 @@ "cardinal_graph = cardinal.CardinalFst()\n", "decimal_graph = decimal.DecimalFst(cardinal_graph)\n", "\n", - "graph_cardinal = 
cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", - "graph_decimal = decimal_graph.final_graph_wo_negative # graphs positive decimals w/o tokenization\n", + "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", + "graph_decimal = decimal_graph.final_graph_wo_negative # graphs positive decimals w/o tokenization\n", "\n", "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", "graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" @@ -4473,7 +4551,7 @@ }, "outputs": [], "source": [ - "graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + "graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", "major_currency_no_normalize = pynini.closure(graph_preposition, 0, 1) + major_currency.project(\"input\")" ] }, @@ -4520,7 +4598,7 @@ " minor_currency += graph_plural\n", "\n", " major_currency_no_normalize = major_currency.project(\"input\")\n", - " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", "\n", " graph_cardinal = cardinal.graph_no_exception\n", @@ -4574,13 +4652,13 @@ "outputs": [], "source": [ "graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", - "graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + "graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", "\n", "graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", "graph_minor_currency = 
pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", "\n", "graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", - "graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency " + "graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency" ] }, { @@ -4600,8 +4678,10 @@ }, "outputs": [], "source": [ - "implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - "implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) " + "implicit_fractional_part = (\n", + " delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + ")\n", + "implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1)" ] }, { @@ -4621,12 +4701,12 @@ }, "outputs": [], "source": [ - "delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", - "delete_et = pynini.closure(delete_et, 0 , 1)\n", + "delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", + "delete_et = pynini.closure(delete_et, 0, 1)\n", "\n", - "delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", + "delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", "\n", - "explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + "explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", "explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", "explicit_fractional_part = pynini.closure(explicit_fractional_part, 0, 1)" ] @@ -4702,7 +4782,7 @@ " minor_currency += graph_plural\n", "\n", " major_currency_no_normalize = 
major_currency.project(\"input\")\n", - " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", "\n", " graph_cardinal = cardinal.graph_no_exception\n", @@ -4712,24 +4792,29 @@ " graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit\n", "\n", " graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", - " graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + " graph_fractional_component = (\n", + " pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + " )\n", "\n", " graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", " graph_minor_currency = pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", "\n", " graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", - " graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency \n", - "\n", - " implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - " implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) \n", + " graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency\n", "\n", + " implicit_fractional_part = (\n", + " delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + " )\n", + " implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1)\n", "\n", - " delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor 
currency\n", - " delete_et = pynini.closure(delete_et, 0 , 1)\n", + " delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", + " delete_et = pynini.closure(delete_et, 0, 1)\n", "\n", - " delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", + " delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", "\n", - " explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + " explicit_fractional_part = (\n", + " pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + " )\n", " explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", " explicit_fractional_part = pynini.closure(explicit_fractional_part, 0, 1)\n", "\n", @@ -4857,18 +4942,18 @@ }, "outputs": [], "source": [ - " def __init__(self, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"verbalize\")\n", - " unit = (\n", - " pynutil.delete(\"currency:\")\n", - " + delete_extra_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " graph = decimal.numbers + delete_space + unit\n", - " delete_tokens = self.delete_tokens(graph)\n", - " self.fst = delete_tokens.optimize()" + "def __init__(self, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"verbalize\")\n", + " unit = (\n", + " pynutil.delete(\"currency:\")\n", + " + delete_extra_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " graph = decimal.numbers + delete_space + unit\n", + " delete_tokens = self.delete_tokens(graph)\n", + " self.fst = delete_tokens.optimize()" ] }, { @@ -4978,99 +5063,103 @@ }, "outputs": [], "source": [ - "hours = pynini.string_map([\n", - " 
(\"zéro\",\"0\"),\n", - " (\"une\",\"1\"),\n", - " (\"deux\",\"2\"),\n", - " (\"trois\",\"3\"),\n", - " (\"quatre\",\"4\"),\n", - " (\"cinq\",\"5\"),\n", - " (\"six\",\"6\"),\n", - " (\"sept\",\"7\"),\n", - " (\"huit\",\"8\"),\n", - " (\"neuf\",\"9\"),\n", - " (\"dix\",\"10\"),\n", - " (\"onze\",\"11\"),\n", - " (\"douze\",\"12\"),\n", - " (\"treize\",\"13\"),\n", - " (\"quatorze\",\"14\"),\n", - " (\"quinze\",\"15\"),\n", - " (\"seize\",\"16\"),\n", - " (\"dix-sept\",\"17\"),\n", - " (\"dix-huit\",\"18\"),\n", - " (\"dix-neuf\",\"19\"),\n", - " (\"vingt\",\"20\"),\n", - " (\"vingt-et-une\",\"21\"),\n", - " (\"vingt et une\",\"21\"),\n", - " (\"vingt-deux\",\"22\"),\n", - " (\"vingt-trois\",\"23\"),\n", - " (\"vingt-quatre\",\"24\"),\n", - "])\n", - "minutes = pynini.string_map([\n", - " (\"une\", \"01\"),\n", - " (\"deux\", \"02\"),\n", - " (\"trois\", \"03\"),\n", - " (\"quatre\", \"04\"),\n", - " (\"cinq\", \"05\"),\n", - " (\"six\", \"06\"),\n", - " (\"sept\", \"07\"),\n", - " (\"huit\", \"08\"),\n", - " (\"neuf\", \"09\"),\n", - " (\"dix\", \"10\"),\n", - " (\"onze\", \"11\"),\n", - " (\"douze\", \"12\"),\n", - " (\"treize\", \"13\"),\n", - " (\"quatorze\", \"14\"),\n", - " (\"quinze\", \"15\"),\n", - " (\"seize\", \"16\"),\n", - " (\"dix-sept\", \"17\"),\n", - " (\"dix-huit\", \"18\"),\n", - " (\"dix-neuf\", \"19\"),\n", - " (\"vingt\", \"20\"),\n", - " (\"vingt-et-une\", \"21\"),\n", - " (\"vingt et une\", \"21\"),\n", - " (\"vingt-deux\", \"22\"),\n", - " (\"vingt-trois\", \"23\"),\n", - " (\"vingt-quatre\", \"27\"),\n", - " (\"vingt-cinq\", \"25\"),\n", - " (\"vingt-six\", \"26\"),\n", - " (\"vingt-sept\", \"27\"),\n", - " (\"vingt-huit\", \"28\"),\n", - " (\"vingt-neuf\", \"29\"),\n", - " (\"trente\", \"30\"),\n", - " (\"trente-et-une\", \"31\"),\n", - " (\"trente et une\", \"31\"),\n", - " (\"trente-deux\", \"32\"),\n", - " (\"trente-trois\", \"33\"),\n", - " (\"trente-quatre\", \"34\"),\n", - " (\"trente-cinq\", \"35\"),\n", - " (\"trente-six\", 
\"36\"),\n", - " (\"trente-sept\", \"37\"),\n", - " (\"trente-huit\", \"38\"),\n", - " (\"trente-neuf\", \"39\"),\n", - " (\"quarante\", \"40\"),\n", - " (\"quarante-et-une\", \"41\"),\n", - " (\"quarante et une\", \"41\"),\n", - " (\"quarante-deux\", \"42\"),\n", - " (\"quarante-trois\", \"43\"),\n", - " (\"quarante-quatre\", \"44\"),\n", - " (\"quarante-cinq\", \"45\"),\n", - " (\"quarante-six\", \"46\"),\n", - " (\"quarante-sept\", \"47\"),\n", - " (\"quarante-huit\", \"48\"),\n", - " (\"quarante-neuf\", \"49\"),\n", - " (\"cinquante\", \"50\"),\n", - " (\"cinquante-et-une\", \"51\"),\n", - " (\"cinquante et une\", \"51\"),\n", - " (\"cinquante-deux\", \"52\"),\n", - " (\"cinquante-trois\", \"53\"),\n", - " (\"cinquante-quatre\", \"54\"),\n", - " (\"cinquante-cinq\", \"55\"),\n", - " (\"cinquante-six\", \"56\"),\n", - " (\"cinquante-sept\", \"57\"),\n", - " (\"cinquante-huit\", \"58\"),\n", - " (\"cinquante-neuf\", \"59\"),\n", - "])" + "hours = pynini.string_map(\n", + " [\n", + " (\"zéro\", \"0\"),\n", + " (\"une\", \"1\"),\n", + " (\"deux\", \"2\"),\n", + " (\"trois\", \"3\"),\n", + " (\"quatre\", \"4\"),\n", + " (\"cinq\", \"5\"),\n", + " (\"six\", \"6\"),\n", + " (\"sept\", \"7\"),\n", + " (\"huit\", \"8\"),\n", + " (\"neuf\", \"9\"),\n", + " (\"dix\", \"10\"),\n", + " (\"onze\", \"11\"),\n", + " (\"douze\", \"12\"),\n", + " (\"treize\", \"13\"),\n", + " (\"quatorze\", \"14\"),\n", + " (\"quinze\", \"15\"),\n", + " (\"seize\", \"16\"),\n", + " (\"dix-sept\", \"17\"),\n", + " (\"dix-huit\", \"18\"),\n", + " (\"dix-neuf\", \"19\"),\n", + " (\"vingt\", \"20\"),\n", + " (\"vingt-et-une\", \"21\"),\n", + " (\"vingt et une\", \"21\"),\n", + " (\"vingt-deux\", \"22\"),\n", + " (\"vingt-trois\", \"23\"),\n", + " (\"vingt-quatre\", \"24\"),\n", + " ]\n", + ")\n", + "minutes = pynini.string_map(\n", + " [\n", + " (\"une\", \"01\"),\n", + " (\"deux\", \"02\"),\n", + " (\"trois\", \"03\"),\n", + " (\"quatre\", \"04\"),\n", + " (\"cinq\", \"05\"),\n", + " (\"six\", 
\"06\"),\n", + " (\"sept\", \"07\"),\n", + " (\"huit\", \"08\"),\n", + " (\"neuf\", \"09\"),\n", + " (\"dix\", \"10\"),\n", + " (\"onze\", \"11\"),\n", + " (\"douze\", \"12\"),\n", + " (\"treize\", \"13\"),\n", + " (\"quatorze\", \"14\"),\n", + " (\"quinze\", \"15\"),\n", + " (\"seize\", \"16\"),\n", + " (\"dix-sept\", \"17\"),\n", + " (\"dix-huit\", \"18\"),\n", + " (\"dix-neuf\", \"19\"),\n", + " (\"vingt\", \"20\"),\n", + " (\"vingt-et-une\", \"21\"),\n", + " (\"vingt et une\", \"21\"),\n", + " (\"vingt-deux\", \"22\"),\n", + " (\"vingt-trois\", \"23\"),\n", + " (\"vingt-quatre\", \"27\"),\n", + " (\"vingt-cinq\", \"25\"),\n", + " (\"vingt-six\", \"26\"),\n", + " (\"vingt-sept\", \"27\"),\n", + " (\"vingt-huit\", \"28\"),\n", + " (\"vingt-neuf\", \"29\"),\n", + " (\"trente\", \"30\"),\n", + " (\"trente-et-une\", \"31\"),\n", + " (\"trente et une\", \"31\"),\n", + " (\"trente-deux\", \"32\"),\n", + " (\"trente-trois\", \"33\"),\n", + " (\"trente-quatre\", \"34\"),\n", + " (\"trente-cinq\", \"35\"),\n", + " (\"trente-six\", \"36\"),\n", + " (\"trente-sept\", \"37\"),\n", + " (\"trente-huit\", \"38\"),\n", + " (\"trente-neuf\", \"39\"),\n", + " (\"quarante\", \"40\"),\n", + " (\"quarante-et-une\", \"41\"),\n", + " (\"quarante et une\", \"41\"),\n", + " (\"quarante-deux\", \"42\"),\n", + " (\"quarante-trois\", \"43\"),\n", + " (\"quarante-quatre\", \"44\"),\n", + " (\"quarante-cinq\", \"45\"),\n", + " (\"quarante-six\", \"46\"),\n", + " (\"quarante-sept\", \"47\"),\n", + " (\"quarante-huit\", \"48\"),\n", + " (\"quarante-neuf\", \"49\"),\n", + " (\"cinquante\", \"50\"),\n", + " (\"cinquante-et-une\", \"51\"),\n", + " (\"cinquante et une\", \"51\"),\n", + " (\"cinquante-deux\", \"52\"),\n", + " (\"cinquante-trois\", \"53\"),\n", + " (\"cinquante-quatre\", \"54\"),\n", + " (\"cinquante-cinq\", \"55\"),\n", + " (\"cinquante-six\", \"56\"),\n", + " (\"cinquante-sept\", \"57\"),\n", + " (\"cinquante-huit\", \"58\"),\n", + " (\"cinquante-neuf\", \"59\"),\n", + " ]\n", + 
")" ] }, { @@ -5164,94 +5253,98 @@ }, "outputs": [], "source": [ - "hours_to = pynini.string_map([\n", - " (\"1\",\"0\"),\n", - " (\"2\",\"1\"),\n", - " (\"3\",\"2\"),\n", - " (\"4\",\"3\"),\n", - " (\"5\",\"4\"),\n", - " (\"6\",\"5\"),\n", - " (\"7\",\"6\"),\n", - " (\"8\",\"7\"),\n", - " (\"9\",\"8\"),\n", - " (\"10\",\"9\"),\n", - " (\"11\",\"10\"),\n", - " (\"12\",\"11\"),\n", - " (\"13\",\"12\"),\n", - " (\"14\",\"13\"),\n", - " (\"15\",\"14\"),\n", - " (\"16\",\"15\"),\n", - " (\"17\",\"16\"),\n", - " (\"18\",\"17\"),\n", - " (\"19\",\"18\"),\n", - " (\"20\",\"19\"),\n", - " (\"21\",\"20\"),\n", - " (\"22\",\"21\"),\n", - " (\"23\",\"22\"),\n", - " (\"24\",\"23\"),\n", - " (\"0\",\"23\"),\n", - "])\n", - "minutes_to = pynini.string_map([\n", - " (\"59\", \"01\"),\n", - " (\"58\", \"02\"),\n", - " (\"57\", \"03\"),\n", - " (\"56\", \"04\"),\n", - " (\"55\", \"05\"),\n", - " (\"54\", \"06\"),\n", - " (\"53\", \"07\"),\n", - " (\"52\", \"08\"),\n", - " (\"51\", \"09\"),\n", - " (\"50\", \"10\"),\n", - " (\"49\", \"11\"),\n", - " (\"48\", \"12\"),\n", - " (\"47\", \"13\"),\n", - " (\"46\", \"14\"),\n", - " (\"45\", \"15\"),\n", - " (\"44\", \"16\"),\n", - " (\"43\", \"17\"),\n", - " (\"42\", \"18\"),\n", - " (\"41\", \"19\"),\n", - " (\"40\", \"20\"),\n", - " (\"39\", \"21\"),\n", - " (\"38\", \"22\"),\n", - " (\"37\", \"23\"),\n", - " (\"36\", \"24\"),\n", - " (\"35\", \"25\"),\n", - " (\"34\", \"26\"),\n", - " (\"33\", \"27\"),\n", - " (\"32\", \"28\"),\n", - " (\"31\", \"29\"),\n", - " (\"30\", \"30\"),\n", - " (\"29\", \"31\"),\n", - " (\"28\", \"32\"),\n", - " (\"27\", \"33\"),\n", - " (\"26\", \"34\"),\n", - " (\"25\", \"35\"),\n", - " (\"24\", \"36\"),\n", - " (\"23\", \"37\"),\n", - " (\"22\", \"38\"),\n", - " (\"21\", \"39\"),\n", - " (\"20\", \"40\"),\n", - " (\"19\", \"41\"),\n", - " (\"18\", \"42\"),\n", - " (\"17\", \"43\"),\n", - " (\"16\", \"44\"),\n", - " (\"15\", \"45\"),\n", - " (\"14\", \"46\"),\n", - " (\"13\", \"47\"),\n", - " (\"12\", 
\"48\"),\n", - " (\"11\", \"49\"),\n", - " (\"10\", \"50\"),\n", - " (\"09\", \"51\"),\n", - " (\"08\", \"52\"),\n", - " (\"07\", \"53\"),\n", - " (\"06\", \"54\"),\n", - " (\"05\", \"55\"),\n", - " (\"04\", \"56\"),\n", - " (\"03\", \"57\"),\n", - " (\"02\", \"58\"),\n", - " (\"01\", \"59\"),\n", - "])\n", + "hours_to = pynini.string_map(\n", + " [\n", + " (\"1\", \"0\"),\n", + " (\"2\", \"1\"),\n", + " (\"3\", \"2\"),\n", + " (\"4\", \"3\"),\n", + " (\"5\", \"4\"),\n", + " (\"6\", \"5\"),\n", + " (\"7\", \"6\"),\n", + " (\"8\", \"7\"),\n", + " (\"9\", \"8\"),\n", + " (\"10\", \"9\"),\n", + " (\"11\", \"10\"),\n", + " (\"12\", \"11\"),\n", + " (\"13\", \"12\"),\n", + " (\"14\", \"13\"),\n", + " (\"15\", \"14\"),\n", + " (\"16\", \"15\"),\n", + " (\"17\", \"16\"),\n", + " (\"18\", \"17\"),\n", + " (\"19\", \"18\"),\n", + " (\"20\", \"19\"),\n", + " (\"21\", \"20\"),\n", + " (\"22\", \"21\"),\n", + " (\"23\", \"22\"),\n", + " (\"24\", \"23\"),\n", + " (\"0\", \"23\"),\n", + " ]\n", + ")\n", + "minutes_to = pynini.string_map(\n", + " [\n", + " (\"59\", \"01\"),\n", + " (\"58\", \"02\"),\n", + " (\"57\", \"03\"),\n", + " (\"56\", \"04\"),\n", + " (\"55\", \"05\"),\n", + " (\"54\", \"06\"),\n", + " (\"53\", \"07\"),\n", + " (\"52\", \"08\"),\n", + " (\"51\", \"09\"),\n", + " (\"50\", \"10\"),\n", + " (\"49\", \"11\"),\n", + " (\"48\", \"12\"),\n", + " (\"47\", \"13\"),\n", + " (\"46\", \"14\"),\n", + " (\"45\", \"15\"),\n", + " (\"44\", \"16\"),\n", + " (\"43\", \"17\"),\n", + " (\"42\", \"18\"),\n", + " (\"41\", \"19\"),\n", + " (\"40\", \"20\"),\n", + " (\"39\", \"21\"),\n", + " (\"38\", \"22\"),\n", + " (\"37\", \"23\"),\n", + " (\"36\", \"24\"),\n", + " (\"35\", \"25\"),\n", + " (\"34\", \"26\"),\n", + " (\"33\", \"27\"),\n", + " (\"32\", \"28\"),\n", + " (\"31\", \"29\"),\n", + " (\"30\", \"30\"),\n", + " (\"29\", \"31\"),\n", + " (\"28\", \"32\"),\n", + " (\"27\", \"33\"),\n", + " (\"26\", \"34\"),\n", + " (\"25\", \"35\"),\n", + " (\"24\", \"36\"),\n", + " 
(\"23\", \"37\"),\n", + " (\"22\", \"38\"),\n", + " (\"21\", \"39\"),\n", + " (\"20\", \"40\"),\n", + " (\"19\", \"41\"),\n", + " (\"18\", \"42\"),\n", + " (\"17\", \"43\"),\n", + " (\"16\", \"44\"),\n", + " (\"15\", \"45\"),\n", + " (\"14\", \"46\"),\n", + " (\"13\", \"47\"),\n", + " (\"12\", \"48\"),\n", + " (\"11\", \"49\"),\n", + " (\"10\", \"50\"),\n", + " (\"09\", \"51\"),\n", + " (\"08\", \"52\"),\n", + " (\"07\", \"53\"),\n", + " (\"06\", \"54\"),\n", + " (\"05\", \"55\"),\n", + " (\"04\", \"56\"),\n", + " (\"03\", \"57\"),\n", + " (\"02\", \"58\"),\n", + " (\"01\", \"59\"),\n", + " ]\n", + ")\n", "graph_moins = pynutil.delete(\"moins\")" ] }, @@ -5300,11 +5393,12 @@ "\n", "graph_minutes_component = (\n", " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", - ") \n", + ")\n", "graph_minutes_component = delete_space + graph_minutes_component\n", "\n", - "graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", - " + pynini.closure(graph_minutes_component, 0, 1))" + "graph_time_standard = (\n", + " graph_hours_component + delete_space + graph_heures + pynini.closure(graph_minutes_component, 0, 1)\n", + ")" ] }, { @@ -5329,8 +5423,9 @@ "graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", "graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", "\n", - "graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", - " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", + "graph_minutes_to_component = (\n", + " minutes | graph_demi | (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart # No 'et' in fractions\n", + ")\n", "graph_minutes_to_component @= minutes_to\n", "graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", "\n", @@ -5381,7 +5476,7 @@ "outputs": [], "source": [ 
"graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", - "graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", + "graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"), (\"du soir\", \"pm\")])\n", "\n", "graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", "\n", @@ -5420,40 +5515,44 @@ "\n", " graph_minutes_component = (\n", " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", - " ) \n", + " )\n", " graph_minutes_component = delete_space + graph_minutes_component\n", "\n", - " graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", - " + pynini.closure(graph_minutes_component, 0, 1))\n", + " graph_time_standard = (\n", + " graph_hours_component + delete_space + graph_heures + pynini.closure(graph_minutes_component, 0, 1)\n", + " )\n", "\n", " graph_hours_to_component = hours | graph_midi | graph_minuit\n", " graph_hours_to_component @= hours_to\n", " graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", " graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", "\n", - " graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", - " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", + " graph_minutes_to_component = (\n", + " minutes | graph_demi | (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart # No 'et' in fractions\n", + " )\n", " graph_minutes_to_component @= minutes_to\n", " graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", "\n", - " graph_time_to = graph_hours_to_component + delete_space + graph_moins + delete_space + graph_minutes_to_component\n", + " graph_time_to = (\n", + " graph_hours_to_component + delete_space + graph_moins + delete_space + 
graph_minutes_to_component\n", + " )\n", "\n", " graph_time_no_suffix = graph_time_standard | graph_time_to\n", "\n", " graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", - " graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", + " graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"), (\"du soir\", \"pm\")])\n", "\n", " graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", "\n", " graph_suffix_component = pynutil.insert(\" suffix: \\\"\") + graph_suffix + pynutil.insert(\"\\\"\")\n", " graph_suffix_component = delete_space + graph_suffix_component\n", " graph_suffix_component = pynini.closure(graph_suffix_component, 0, 1)\n", - " \n", + "\n", " final_graph = graph_time_no_suffix + graph_suffix_component\n", "\n", " final_graph = self.add_tokens(final_graph)\n", "\n", - " self.fst = final_graph.optimize()\n" + " self.fst = final_graph.optimize()" ] }, { @@ -5556,19 +5655,21 @@ }, "outputs": [], "source": [ - "hour_to_night = pynini.string_map([\n", - " (\"1\", \"13\"),\n", - " (\"2\", \"14\"),\n", - " (\"3\", \"15\"),\n", - " (\"4\", \"16\"),\n", - " (\"5\", \"17\"),\n", - " (\"6\", \"18\"),\n", - " (\"7\", \"19\"),\n", - " (\"8\", \"20\"),\n", - " (\"9\", \"21\"),\n", - " (\"10\", \"22\"),\n", - " (\"11\", \"23\"), # Note that 12 and 24 would be phrased \"midi\" and \"minuit\" respectively\n", - "])" + "hour_to_night = pynini.string_map(\n", + " [\n", + " (\"1\", \"13\"),\n", + " (\"2\", \"14\"),\n", + " (\"3\", \"15\"),\n", + " (\"4\", \"16\"),\n", + " (\"5\", \"17\"),\n", + " (\"6\", \"18\"),\n", + " (\"7\", \"19\"),\n", + " (\"8\", \"20\"),\n", + " (\"9\", \"21\"),\n", + " (\"10\", \"22\"),\n", + " (\"11\", \"23\"), # Note that 12 and 24 would be phrased \"midi\" and \"minuit\" respectively\n", + " ]\n", + ")" ] }, { @@ -5589,14 +5690,7 @@ "outputs": [], "source": [ "night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", - "graph |= (\n", - " hour @ 
hour_to_night\n", - " + delete_extra_space\n", - " + pynutil.insert(\"h\")\n", - " + minute.ques\n", - " + delete_space\n", - " + night_suffixes\n", - " )" + "graph |= hour @ hour_to_night + delete_extra_space + pynutil.insert(\"h\") + minute.ques + delete_space + night_suffixes" ] }, { @@ -5620,19 +5714,21 @@ " def __init__(self):\n", " super().__init__(name=\"time\", kind=\"verbalize\")\n", "\n", - " hour_to_night = pynini.string_map([\n", - " (\"1\", \"13\"),\n", - " (\"2\", \"14\"),\n", - " (\"3\", \"15\"),\n", - " (\"4\", \"16\"),\n", - " (\"5\", \"17\"),\n", - " (\"6\", \"18\"),\n", - " (\"7\", \"19\"),\n", - " (\"8\", \"20\"),\n", - " (\"9\", \"21\"),\n", - " (\"10\", \"22\"),\n", - " (\"11\", \"23\"),\n", - "])\n", + " hour_to_night = pynini.string_map(\n", + " [\n", + " (\"1\", \"13\"),\n", + " (\"2\", \"14\"),\n", + " (\"3\", \"15\"),\n", + " (\"4\", \"16\"),\n", + " (\"5\", \"17\"),\n", + " (\"6\", \"18\"),\n", + " (\"7\", \"19\"),\n", + " (\"8\", \"20\"),\n", + " (\"9\", \"21\"),\n", + " (\"10\", \"22\"),\n", + " (\"11\", \"23\"),\n", + " ]\n", + " )\n", "\n", " day_suffixes = pynutil.delete(\"suffix: \\\"am\\\"\")\n", " night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", @@ -5723,11 +5819,7 @@ }, "outputs": [], "source": [ - "graph = pynini.string_map([\n", - " (\"mister\", \"mr.\"),\n", - " (\"h m s\", \"h.m.s\"),\n", - " (\"doctor\", \"dr.\")\n", - "])" + "graph = pynini.string_map([(\"mister\", \"mr.\"), (\"h m s\", \"h.m.s\"), (\"doctor\", \"dr.\")])" ] }, { @@ -5769,10 +5861,7 @@ " def __init__(self):\n", " super().__init__(name=\"whitelist\", kind=\"classify\")\n", "\n", - " whitelist = pynini.string_map([\n", - " (\"mister\", \"mr.\"),\n", - " (\"h m s\", \"h.m.s\"),\n", - " (\"doctor\", \"dr.\")])\n", + " whitelist = pynini.string_map([(\"mister\", \"mr.\"), (\"h m s\", \"h.m.s\"), (\"doctor\", \"dr.\")])\n", " graph = pynutil.insert(\"name: \\\"\") + convert_space(whitelist) + pynutil.insert(\"\\\"\")\n", " self.fst = graph.optimize()" 
] @@ -5811,7 +5900,9 @@ " + pynini.closure(NEMO_CHAR - \" \", 1)\n", " + pynutil.delete(\"\\\"\")\n", " )\n", - " graph = graph @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Removes possible null token\n", + " graph = graph @ pynini.cdrewrite(\n", + " pynini.cross(u\"\\u00a0\", \" \"), \"\", \"\", NEMO_SIGMA\n", + " ) # Removes possible null token\n", " self.fst = graph.optimize()" ] }, @@ -5932,7 +6023,7 @@ " super().__init__(name=\"punctuation\", kind=\"classify\")\n", "\n", " s = \"!#$%&\\'()*+,-./:;<=>?@^_`{|}~\"\n", - " guillemets = \"\\u00AB\" + \"\\u00BB\" # quotation marks in French.\n", + " guillemets = \"\\u00ab\" + \"\\u00bb\" # quotation marks in French.\n", " s += guillemets\n", " punct = pynini.union(*s)\n", "\n", @@ -5972,7 +6063,9 @@ " super().__init__(name=\"word\", kind=\"verbalize\")\n", " chars = pynini.closure(NEMO_CHAR - \" \", 1)\n", " char = pynutil.delete(\"name:\") + delete_space + pynutil.delete(\"\\\"\") + chars + pynutil.delete(\"\\\"\")\n", - " graph = char @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Cleans up possible null character\n", + " graph = char @ pynini.cdrewrite(\n", + " pynini.cross(u\"\\u00a0\", \" \"), \"\", \"\", NEMO_SIGMA\n", + " ) # Cleans up possible null character\n", "\n", " self.fst = graph.optimize()" ] @@ -6167,13 +6260,7 @@ " punct_graph = PunctuationFst().fst\n", "\n", " classify = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " | ordinal_graph\n", - " | money_graph\n", - " | word_graph\n", + " time_graph | whitelist_graph | decimal_graph | cardinal_graph | ordinal_graph | money_graph | word_graph\n", " )\n", " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")" ] @@ -6220,9 +6307,9 @@ "source": [ "token = \"PLACEHOLDER\"\n", "token_plus_punct = (\n", - " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", - " 
) # Note the use of closure incase there are multiple punctuations\n", - "graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n" + " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", + ") # Note the use of closure incase there are multiple punctuations\n", + "graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)" ] }, { @@ -6273,15 +6360,7 @@ }, "outputs": [], "source": [ - "classify = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " | ordinal_graph\n", - " | money_graph\n", - " | word_graph\n", - " )\n", + "classify = time_graph | whitelist_graph | decimal_graph | cardinal_graph | ordinal_graph | money_graph | word_graph\n", "punct = pynutil.insert(\"tokens { \") + punct_graph + pynutil.insert(\" }\")" ] }, @@ -6303,14 +6382,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1)\n", - " | pynutil.add_weight(whitelist_graph, 1)\n", - " | pynutil.add_weight(decimal_graph, 1)\n", - " | pynutil.add_weight(cardinal_graph, 1)\n", - " | pynutil.add_weight(ordinal_graph, 1)\n", - " | pynutil.add_weight(money_graph, 1)\n", - " | pynutil.add_weight(word_graph, 1)\n", - " )\n", + " pynutil.add_weight(time_graph, 1)\n", + " | pynutil.add_weight(whitelist_graph, 1)\n", + " | pynutil.add_weight(decimal_graph, 1)\n", + " | pynutil.add_weight(cardinal_graph, 1)\n", + " | pynutil.add_weight(ordinal_graph, 1)\n", + " | pynutil.add_weight(money_graph, 1)\n", + " | pynutil.add_weight(word_graph, 1)\n", + ")\n", "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" ] }, @@ -6334,14 +6413,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | 
pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 1.1)\n", - " )\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 1.1)\n", + ")\n", "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" ] }, @@ -6361,14 +6440,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 100)\n", + ")\n", "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" ] }, @@ -6397,14 +6476,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | 
pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 100)\n", + ")\n", "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" ] }, @@ -6426,14 +6505,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1)\n", - " | pynutil.add_weight(whitelist_graph, 1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1)\n", - " | pynutil.add_weight(money_graph, 1.09)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", + " pynutil.add_weight(time_graph, 1)\n", + " | pynutil.add_weight(whitelist_graph, 1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1)\n", + " | pynutil.add_weight(money_graph, 1.09)\n", + " | pynutil.add_weight(word_graph, 100)\n", + ")\n", "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" ] }, @@ -6457,14 +6536,14 @@ "outputs": [], "source": [ "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.07)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.08)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.07)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.08)\n", + " | pynutil.add_weight(word_graph, 100)\n", + ")\n", "punct = 
pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" ] }, @@ -6490,7 +6569,7 @@ "class ClassifyFst(GraphFst):\n", " \"\"\"\n", " Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.\n", - " For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. \n", + " For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File.\n", " More details to deployment at NeMo/tools/text_processing_deployment.\n", "\n", " Args:\n", @@ -6614,15 +6693,16 @@ "source": [ "import os\n", "\n", + "\n", "class ClassifyFst(GraphFst):\n", " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", " # Grammar here\n", " # ....\n", " if cache_dir is not None and cache_dir != \"None\":\n", - " os.makedirs(cache_dir, exist_ok=True)\n", - " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" + " os.makedirs(cache_dir, exist_ok=True)\n", + " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", + " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" ] }, { @@ -6644,6 +6724,7 @@ "source": [ "import os\n", "\n", + "\n", "class ClassifyFst(GraphFst):\n", " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", @@ -6655,7 +6736,7 @@ " if cache_dir is not None and cache_dir != \"None\":\n", " os.makedirs(cache_dir, exist_ok=True)\n", " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " generator_main(far_file, {\"tokenize_and_classify\": self.fst})\n" + " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" ] }, { @@ -6793,6 +6874,7 @@ "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist 
import WhiteListFst\n", "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n", "\n", + "\n", "class VerbalizeFst(GraphFst):\n", " def __init__(self):\n", " super().__init__(name=\"verbalize\", kind=\"verbalize\")\n", @@ -6804,14 +6886,7 @@ " whitelist_graph = WhiteListFst().fst\n", " money_graph = MoneyFst(decimal=decimal).fst\n", " time_graph = TimeFst().fst\n", - " graph = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | money_graph\n", - " | ordinal_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " )\n", + " graph = time_graph | whitelist_graph | money_graph | ordinal_graph | decimal_graph | cardinal_graph\n", " self.fst = graph" ] }, @@ -6843,7 +6918,6 @@ }, "outputs": [], "source": [ - "\n", "class VerbalizeFinalFst(GraphFst):\n", " def __init__(self):\n", " super().__init__(name=\"verbalize_final\", kind=\"verbalize\")\n",