Added Language as a Feature.

PiperOrigin-RevId: 726931209
google-research · Feb 14, 2025 · ba4c928 · ba4c928
1 parent 17fcdb4
commit ba4c928
Show file tree

Hide file tree

Showing 15 changed files with 354 additions and 32 deletions.
diff --git a/nisaba/scripts/natural_translit/language_params/BUILD.bazel b/nisaba/scripts/natural_translit/language_params/BUILD.bazel
@@ -42,7 +42,7 @@ py_library(
     name = "en",
     srcs = ["en.py"],
     deps = [
-        "//nisaba/scripts/natural_translit/phonology/inventories:x_mul",
+        "//nisaba/scripts/natural_translit/phonology/inventories:x_uni",
         "//nisaba/scripts/natural_translit/script:grapheme",
         "//nisaba/scripts/natural_translit/script/inventories:latn",
     ],

diff --git a/nisaba/scripts/natural_translit/language_params/en.py b/nisaba/scripts/natural_translit/language_params/en.py
@@ -14,15 +14,15 @@
 
 """Language parameters for English."""
 
-from nisaba.scripts.natural_translit.phonology.inventories import x_mul
+from nisaba.scripts.natural_translit.phonology.inventories import x_uni
 from nisaba.scripts.natural_translit.script import grapheme as g
 from nisaba.scripts.natural_translit.script.inventories import latn as l
 
 
 def _latn_inventory() -> g.Grapheme.Inventory:
   """Builds a grapheme inventory for English."""
   latn = l.LATN
-  ph = x_mul.PHONEMES
+  ph = x_uni.PHONEMES
   gr = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.latn, 'en')
   lowercase = [
       latn.a,

diff --git a/nisaba/scripts/natural_translit/phonology/BUILD.bazel b/nisaba/scripts/natural_translit/phonology/BUILD.bazel
@@ -68,6 +68,7 @@ py_library(
     srcs = ["phonological_symbol.py"],
     deps = [
         "//nisaba/scripts/natural_translit/phonology/features:descriptive",
+        "//nisaba/scripts/natural_translit/phonology/features:language",
         "//nisaba/scripts/natural_translit/utils:expression",
         "//nisaba/scripts/natural_translit/utils:feature",
         "//nisaba/scripts/natural_translit/utils:inventory",

diff --git a/nisaba/scripts/natural_translit/phonology/features/BUILD.bazel b/nisaba/scripts/natural_translit/phonology/features/BUILD.bazel
@@ -37,3 +37,24 @@ py_test(
         "@io_abseil_py//absl/testing:absltest",
     ],
 )
+
+py_library(
+    name = "language",
+    srcs = ["language.py"],
+    deps = [
+        "//nisaba/scripts/natural_translit/utils:feature",
+        "//nisaba/scripts/natural_translit/utils:type_op",
+        requirement("pycountry"),
+    ],
+)
+
+py_test(
+    name = "language_test",
+    srcs = ["language_test.py"],
+    main = "language_test.py",
+    deps = [
+        ":language",
+        "//nisaba/scripts/natural_translit/utils:test_op",
+        "@io_abseil_py//absl/testing:absltest",
+    ],
+)
diff --git a/nisaba/scripts/natural_translit/phonology/features/README.md b/nisaba/scripts/natural_translit/phonology/features/README.md
@@ -2,11 +2,24 @@
 
 [TOC]
 
+## Language
+
+The language inventory is organized by language families. The depth and the
+granularity of the tree are pragmatically determined and can be updated as
+needed.
+
+For example, the Tamil-Kota branch of the Dravidian family is where the dental
+vs alveolar place becomes contrastive for /t̪/ and /t/, which has a direct
+impact on transliteration, so it is added as a node.
+
+[Family tree for the supported languages](https://github.com/google-research/nisaba/blob/main/nisaba/scripts/natural_translit/phonology/features/docs/language.md)
+
 ## Descriptive features
 
 This inventory defines a set of descriptive phonological features that can be
 used to build phonological profiles based on where a phonological symbol is
-located on an[IPA chart](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart).
+located on an
+[IPA chart](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart).
 
 ### How to build a feature profile
 

diff --git a/nisaba/scripts/natural_translit/phonology/features/docs/language.md b/nisaba/scripts/natural_translit/phonology/features/docs/language.md
@@ -0,0 +1,70 @@
+# Languages
+
+Feature family tree for the supported
+[languages](https://github.com/google-research/nisaba/blob/main/nisaba/scripts/natural_translit/phonology/features/README.md#language).
+
+<!-- AUTO-GENERATED INVENTORY STRING STARTS HERE -->
+
+```dot
+graph {
+ordering="out"
+size = 12
+rankdir="LR"
+ranksep="1, equally"
+style="invis"
+node [shape="plain"]
+language [label="Language"]
+dravidian [label="Dravidian"]
+language -- dravidian
+subgraph {
+kn [label="Kannada"]
+dravidian -- kn
+te [label="Telugu"]
+dravidian -- te
+tamil_kota [label="Tamil-Kota"]
+dravidian -- tamil_kota
+subgraph {
+ml [label="Malayalam"]
+tamil_kota -- ml
+ta [label="Tamil"]
+tamil_kota -- ta
+}
+}
+indo_european [label="Indo-European"]
+language -- indo_european
+subgraph {
+indo_aryan [label="Indo-Aryan"]
+indo_european -- indo_aryan
+subgraph {
+bn [label="Bengali"]
+indo_aryan -- bn
+gu [label="Gujarati"]
+indo_aryan -- gu
+hi [label="Hindi"]
+indo_aryan -- hi
+mr [label="Marathi"]
+indo_aryan -- mr
+pa [label="Panjabi"]
+indo_aryan -- pa
+}
+germanic [label="Germanic"]
+indo_european -- germanic
+subgraph {
+en [label="English"]
+germanic -- en
+}
+}
+mixed_family [label="Mixed Family Tags"]
+language -- mixed_family
+subgraph {
+mul [label="Multiple Languages"]
+mixed_family -- mul
+und [label="Undetermined"]
+mixed_family -- und
+x_uni [label="Unified Multilingual"]
+mixed_family -- x_uni
+x_psa [label="Pan South Asian"]
+mixed_family -- x_psa
+}
+}
+```
diff --git a/nisaba/scripts/natural_translit/phonology/features/language.py b/nisaba/scripts/natural_translit/phonology/features/language.py
@@ -0,0 +1,128 @@
+# Copyright 2024 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Language feature for PhonologicalSymbol."""
+
+from __future__ import annotations
+
+from typing import Union
+
+import pycountry
+
+from nisaba.scripts.natural_translit.utils import feature as ft
+from nisaba.scripts.natural_translit.utils import type_op as ty
+
+
+class Language(ft.Feature):
+  """Language feature for Phon.
+
+  Extends `ft.Feature` with a numeric `index`. Languages built with `from_iso`
+  take their index from their position in the iteration order of
+  `pycountry.languages`; other languages receive the index explicitly.
+
+  Attributes:
+    index: Numeric index of the language.
+  """
+
+  OR_NOTHING = Union['Language', ty.Nothing]
+
+  def __init__(self, alias: str, text: str, index: int):
+    """Initializes the language feature.
+
+    Args:
+      alias: Alias of the feature, forwarded to `ft.Feature`.
+      text: Text of the feature, forwarded to `ft.Feature`.
+      index: Numeric index stored on the instance.
+    """
+    super().__init__(alias, text)
+    self.index = index
+
+  @classmethod
+  def from_iso(cls, iso_code: str) -> Language:
+    """Builds a language from a two or three letter ISO code.
+
+    Two letter codes are looked up in the `alpha_2` field of
+    `pycountry.languages` and three letter codes in the `alpha_3` field.
+
+    Args:
+      iso_code: Two or three letter language code.
+
+    Returns:
+      A Language whose alias is the lowercased code, whose text is the
+      pycountry language name, and whose index is the position of the language
+      in `list(pycountry.languages)`.
+
+    Raises:
+      ValueError: If the code is not two or three characters long, or if
+        pycountry has no language for the code.
+    """
+    if len(iso_code) == 2:
+      language = pycountry.languages.get(alpha_2=iso_code)
+    elif len(iso_code) == 3:
+      language = pycountry.languages.get(alpha_3=iso_code)
+    else:
+      raise ValueError(f'Invalid ISO code: {iso_code}')
+    # pycountry returns None for codes that are not in its database. Fail here
+    # with a clear message instead of an AttributeError on `language.name`.
+    if language is None:
+      raise ValueError(f'Unknown ISO code: {iso_code}')
+    # NOTE(review): the index depends on the order of the pycountry database,
+    # so it presumably can shift between pycountry releases - confirm before
+    # relying on hard-coded index values.
+    return cls(
+        iso_code.lower(),
+        language.name,
+        list(pycountry.languages).index(language),
+    )
+
+  class Inventory(ft.Feature.Inventory):
+    """Language feature inventory."""
+
+    def __str__(self) -> str:
+      """Language family tree as markdown dot code block."""
+      # Currently all languages are equidistant and have 1 distance from each
+      # other so there is no need for distance tables. If the subgroup steps
+      # are changed to reflect similarity of related languages, the distance
+      # tables should be appended below.
+      return self.language.visualize()
+
+def _features() -> Language.Inventory:
+  """Language feature inventory.
+
+  The language inventory is organised by language families. The depth and the
+  granularity of the family hierarchy are pragmatic and can be updated as
+  needed.
+
+  Returns:
+    An inventory of languages.
+  """
+  f = ft.Feature
+  features = Language.Inventory(
+      'languages',
+      f.Aspect(
+          f.equidistant(
+              'language',
+              f.equidistant(
+                  'dravidian',
+                  Language.from_iso('kn'),
+                  Language.from_iso('te'),
+                  f.equidistant(
+                      ('tamil_kota', 'Tamil-Kota'),
+                      Language.from_iso('ml'),
+                      Language.from_iso('ta'),
+                  ),
+              ),
+              f.equidistant(
+                  ('indo_european', 'Indo-European'),
+                  f.equidistant(
+                      ('indo_aryan', 'Indo-Aryan'),
+                      Language.from_iso('bn'),
+                      Language.from_iso('gu'),
+                      Language.from_iso('hi'),
+                      Language.from_iso('mr'),
+                      Language.from_iso('pa'),
+                  ),
+                  f.equidistant(
+                      'germanic',
+                      Language.from_iso('en'),
+                  ),
+              ),
+              f.equidistant(
+                  ('mixed_family', 'Mixed Family Tags'),
+                  # ISO 'mul' is a generic code for mixed-language datasets.
+                  Language.from_iso('mul'),
+                  Language.from_iso('und'),
+                  # x_uni is a custom code for the unified phoneme inventory
+                  # that will cover all IPA symbols and PHOIBLE segments. Index
+                  # 8000 is reserved for the unified inventory. This will
+                  # provide unique indices for all phonemes as long as the
+                  # number of pycountry languages is less than 8000 (currently
+                  # 7874). If the number of pycountry languages exceeds 8000, we
+                  # can change ReservedIndex for phoneme to 30_000_000 and
+                  # LANG_PREFIX_MULTIPLIER to 1_000 to accommodate.
+                  Language('x_uni', 'Unified Multilingual', 8_000),
+                  # x_uni will potentially cover 2k to 6k segments, resulting in
+                  # symbol indices from 3_800_001 up to 3_806_xxx. In order to
+                  # avoid symbol index collision, we start other custom language
+                  # indices above 9000.
+                  Language('x_psa', 'Pan South Asian', 9_001),
+              ),
+          ),
+      ),
+  )
+  return features
+
+
+# Module-level language inventory, built once when this module is imported.
+FEATURES = _features()
diff --git a/nisaba/scripts/natural_translit/phonology/features/language_test.py b/nisaba/scripts/natural_translit/phonology/features/language_test.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from absl.testing import absltest
+from nisaba.scripts.natural_translit.phonology.features import language
+from nisaba.scripts.natural_translit.utils import test_op
+
+_LANG = language.FEATURES.language
+
+
+class LanguageTest(test_op.TestCase):
+  """Checks the indices of the entries in the language inventory."""
+
+  def test_language(self):
+    """Spot-checks one ISO language index and the custom x_uni index."""
+    self.assertEqual(_LANG.bn.index, 624)
+    self.assertEqual(_LANG.x_uni.index, 8_000)
+
+  def test_no_duplicate_indices(self):
+    """Every item yielded by iterating the language feature has its own index."""
+    # `lang` replaces the ambiguous single-letter name `l`.
+    index_list = [lang.index for lang in _LANG]
+    self.assertEqual(len(set(index_list)), len(index_list))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/nisaba/scripts/natural_translit/phonology/inventories/BUILD.bazel b/nisaba/scripts/natural_translit/phonology/inventories/BUILD.bazel
@@ -19,19 +19,19 @@ package(
 )
 
 py_library(
-    name = "x_mul",
-    srcs = ["x_mul.py"],
+    name = "x_uni",
+    srcs = ["x_uni.py"],
     deps = [
         "//nisaba/scripts/natural_translit/phonology:phonological_symbol",
     ],
 )
 
 py_test(
-    name = "x_mul_test",
-    srcs = ["x_mul_test.py"],
-    main = "x_mul_test.py",
+    name = "x_uni_test",
+    srcs = ["x_uni_test.py"],
+    main = "x_uni_test.py",
     deps = [
-        ":x_mul",
+        ":x_uni",
         "//nisaba/scripts/natural_translit/utils:test_op",
         "@io_abseil_py//absl/testing:absltest",
     ],

diff --git a/...nslit/phonology/inventories/docs/x_mul.md → ...nslit/phonology/inventories/docs/x_uni.md b/...nslit/phonology/inventories/docs/x_mul.md → ...nslit/phonology/inventories/docs/x_uni.md
@@ -6,7 +6,7 @@ Feature tables for the phonological symbols in multilingual 'x_mul' inventory.
 
 <!-- AUTO-GENERATED INVENTORY STRING STARTS HERE -->
 
-## Inventory: x_mul
+## Inventory: x_uni
 
 ### alias: a	ipa: a	name: open front unrounded vowel
   a features:

diff --git a/...l_translit/phonology/inventories/x_mul.py → ...l_translit/phonology/inventories/x_uni.py b/...l_translit/phonology/inventories/x_mul.py → ...l_translit/phonology/inventories/x_uni.py
diff --git a/...nslit/phonology/inventories/x_mul_test.py → ...nslit/phonology/inventories/x_uni_test.py b/...nslit/phonology/inventories/x_mul_test.py → ...nslit/phonology/inventories/x_uni_test.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from absl.testing import absltest
-from nisaba.scripts.natural_translit.phonology.inventories import x_mul
+from nisaba.scripts.natural_translit.phonology.inventories import x_uni
 from nisaba.scripts.natural_translit.utils import test_op