Skip to content

Commit

Permalink
feat:standardize_lang_tag (#267)
Browse files Browse the repository at this point in the history
* feat:standardize_lang_tag

* Update __init__.py

* normalize lang

* normalize lang

* fix:dialect support

* standardize everywhere

* Update ovos_plugin_manager/templates/tts.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* fix syntax error

---------

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
  • Loading branch information
JarbasAl and coderabbitai[bot] authored Oct 12, 2024
1 parent 1ab200b commit 08ad348
Show file tree
Hide file tree
Showing 36 changed files with 131 additions and 139 deletions.
1 change: 0 additions & 1 deletion ovos_plugin_manager/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from ovos_plugin_manager.templates.segmentation import Segmenter



def find_segmentation_plugins() -> dict:
"""
Find all installed plugins
Expand Down
14 changes: 7 additions & 7 deletions ovos_plugin_manager/templates/coreference.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import word_tokenize

Expand Down Expand Up @@ -64,10 +65,10 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def contains_corefs(self, text, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang, macro=True)
if lang.startswith("en"):
indicators = self.COREFERENCE_INDICATORS_EN
elif lang.startswith("pt"):
Expand Down Expand Up @@ -120,7 +121,7 @@ def extract_replacements(original, solved):
return bucket

def add_context(self, word, solved, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
if lang not in self.contexts:
self.contexts[lang] = {}
if word not in self.contexts[lang]:
Expand All @@ -130,7 +131,7 @@ def add_context(self, word, solved, lang=None):
self.contexts[lang][word].append(solved)

def extract_context(self, text=None, solved=None, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
text = text or self._prev_sentence
solved = solved or self._prev_solved
replaced = self.extract_replacements(text, solved)
Expand All @@ -139,7 +140,7 @@ def extract_context(self, text=None, solved=None, lang=None):
return replaced

def replace_coreferences(self, text, lang=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
solved = self.solve_corefs(text, lang=lang)
self._prev_sentence = text
self._prev_solved = solved
Expand All @@ -148,7 +149,7 @@ def replace_coreferences(self, text, lang=None, set_context=False):
return solved

def replace_coreferences_with_context(self, text, lang=None, context=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
lang_context = self.contexts.get(lang) or {}
default_context = {k: v[0] for k, v in lang_context.items() if v}

Expand All @@ -168,7 +169,6 @@ def replace_coreferences_with_context(self, text, lang=None, context=None, set_c
return solved

def solve_corefs(self, text, lang=None):
lang = lang or self.lang
return text


Expand Down
5 changes: 3 additions & 2 deletions ovos_plugin_manager/templates/hotwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
from ovos_config import Configuration
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand All @@ -30,7 +31,7 @@ class HotWordEngine:
lang (str): language code (BCP-47)
"""

def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
def __init__(self, key_phrase="hey mycroft", config=None, lang="en-US"):
self.key_phrase = str(key_phrase).lower()
mycroft_config = Configuration()
if config is None:
Expand All @@ -49,7 +50,7 @@ def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
self.expected_duration = self.num_phonemes * phoneme_duration

self.listener_config = mycroft_config.get("listener") or {}
self.lang = str(self.config.get("lang", lang)).lower()
self.lang = standardize_lang_tag(self.config.get("lang", lang))

@classproperty
def runtime_requirements(self):
Expand Down
16 changes: 9 additions & 7 deletions ovos_plugin_manager/templates/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from ovos_config.config import Configuration
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from typing import Optional, Dict, Union, List, Set

Expand All @@ -16,8 +17,10 @@ def __init__(self, config: Optional[Dict[str, Union[str, int]]] = None):
Can contain "lang" for default language, "hint_lang" for a hint language, and "boost" for language boost score.
"""
self.config = config or {}
self.default_language = self.config.get("lang", "en-us")
self.hint_language = self.config.get("hint_lang") or self.config.get('user') or self.default_language
self.default_language = standardize_lang_tag(self.config.get("lang", "en-US"))
self.hint_language = standardize_lang_tag(self.config.get("hint_lang") or
self.config.get('user') or
self.default_language)
self.boost = self.config.get("boost")

@classproperty
Expand Down Expand Up @@ -46,7 +49,7 @@ def detect(self, text: str) -> str:
text (str): The text to detect the language of.
Returns:
str: The detected language code (e.g., 'en-us').
str: The detected language code (e.g., 'en-US').
"""

@abc.abstractmethod
Expand Down Expand Up @@ -85,11 +88,10 @@ def __init__(self, config: Optional[Dict[str, str]] = None):
"""
self.config = config or {}
# translate from, unless specified/detected otherwise
self.default_language = self.config.get("lang") or "en-us"
self.default_language = standardize_lang_tag(self.config.get("lang") or "en-US")
# translate to
self.internal_language = (Configuration().get('language') or
dict()).get("internal") or \
self.default_language
self.internal_language = standardize_lang_tag(Configuration().get('language', {}).get("internal") or \
self.default_language)

@classproperty
def runtime_requirements(self) -> RuntimeRequirements:
Expand Down
5 changes: 3 additions & 2 deletions ovos_plugin_manager/templates/postag.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand Down Expand Up @@ -48,10 +49,10 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def postag(self, spans, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
# this should be implemented by plugins!
if lang.startswith("pt"):
return _dummy_postag_pt(spans)
Expand Down
6 changes: 3 additions & 3 deletions ovos_plugin_manager/templates/segmentation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils import flatten_list
from ovos_utils import classproperty, flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import sentence_tokenize

Expand Down Expand Up @@ -58,7 +58,7 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

@staticmethod
def __extract(text, markers):
Expand Down
5 changes: 5 additions & 0 deletions ovos_plugin_manager/templates/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from json_database import JsonStorageXDG
from ovos_utils.log import LOG, log_deprecation
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.xdg_utils import xdg_cache_home

from ovos_plugin_manager.templates.language import LanguageTranslator, LanguageDetector
Expand All @@ -26,6 +27,8 @@ def func_wrapper(*args, **kwargs):
return func(*args, **kwargs)

lang = kwargs.get("lang")
if lang:
lang = standardize_lang_tag(lang)
# check if translation can be skipped
if any([lang is None,
lang == solver.default_lang,
Expand Down Expand Up @@ -91,6 +94,8 @@ def func_wrapper(*args, **kwargs):
lang = solver.detect_language(v)
LOG.debug(f"detected 'lang': {lang} in argument '{idx}' for func: {func}")

if lang:
lang = standardize_lang_tag(lang)
kwargs["lang"] = lang
return func(*args, **kwargs)

Expand Down
17 changes: 7 additions & 10 deletions ovos_plugin_manager/templates/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ovos_utils import classproperty
from ovos_utils.log import deprecated
from ovos_utils.process_utils import RuntimeRequirements

from ovos_utils.lang import standardize_lang_tag
from ovos_plugin_manager.utils.config import get_plugin_config


Expand Down Expand Up @@ -78,14 +78,14 @@ def recognizer(self, val):

@property
def lang(self):
return self._lang or \
return standardize_lang_tag(self._lang or \
self.config.get("lang") or \
Configuration().get("lang", "en-us")
Configuration().get("lang", "en-US"))

@lang.setter
def lang(self, val):
# backwards compat
self._lang = val
self._lang = standardize_lang_tag(val)

@property
@deprecated("self.keys has been deprecated! "
Expand Down Expand Up @@ -114,10 +114,7 @@ def credential(self, val):
"implement config handling directly instead", "1.0.0")
def init_language(config_core):
lang = config_core.get("lang", "en-US")
langs = lang.split("-")
if len(langs) == 2:
return langs[0].lower() + "-" + langs[1].upper()
return lang
return standardize_lang_tag(lang, macro=True)

@abstractmethod
def execute(self, audio, language: Optional[str] = None) -> str:
Expand Down Expand Up @@ -180,7 +177,7 @@ class StreamThread(Thread, metaclass=ABCMeta):

def __init__(self, queue, language):
super().__init__()
self.language = language
self.language = standardize_lang_tag(language)
self.queue = queue
self.text = None

Expand Down Expand Up @@ -219,7 +216,7 @@ def stream_start(self, language=None):
self.stream_stop()
self.queue = Queue()
self.stream = self.create_streaming_thread()
self.stream.language = language or self.lang
self.stream.language = standardize_lang_tag(language or self.lang)
self.transcript_ready.clear()
self.stream.start()

Expand Down
5 changes: 2 additions & 3 deletions ovos_plugin_manager/templates/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import span_indexed_word_tokenize, word_tokenize

Expand Down Expand Up @@ -49,14 +50,12 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def span_tokenize(self, text, lang=None):
lang = lang or self.lang
return span_indexed_word_tokenize(text)

def tokenize(self, text, lang=None):
lang = lang or self.lang
return word_tokenize(text)

@staticmethod
Expand Down
7 changes: 4 additions & 3 deletions ovos_plugin_manager/templates/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ovos_utils.fakebus import FakeBus
from ovos_utils.file_utils import get_cache_directory
from ovos_utils.lang.visimes import VISIMES
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG, deprecated, log_deprecation
from ovos_utils.metrics import Stopwatch
from ovos_utils.process_utils import RuntimeRequirements
Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(self, plugin_id: str, lang: str, voice: str, synth_kwargs: dict = N
synth_kwargs (dict, optional): Additional keyword arguments for the synthesizer.
"""
self.plugin_id = plugin_id
self.lang = lang
self.lang = standardize_lang_tag(lang)
self.voice = voice
self.synth_kwargs = synth_kwargs or {}

Expand Down Expand Up @@ -593,7 +594,7 @@ def _get_ctxt(self, kwargs=None) -> TTSContext:

LOG.debug(f"TTS kwargs: {kwargs}")
return TTSContext(plugin_id=self.plugin_id,
lang=kwargs.get("lang") or Configuration().get("lang", "en-us"),
lang=kwargs.get("lang") or Configuration().get("lang", "en-US"),
voice=kwargs.get("voice", "default"),
synth_kwargs=kwargs)

Expand Down Expand Up @@ -933,7 +934,7 @@ def lang(self):
if message:
sess = SessionManager.get(message)
return sess.lang
return self.config.get("lang") or 'en-us'
return standardize_lang_tag(self.config.get("lang") or 'en-US')

@lang.setter
@deprecated("language is defined per request in get_tts, self.lang is not used",
Expand Down
9 changes: 5 additions & 4 deletions ovos_plugin_manager/thirdparty/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import Optional, List, Dict

from ovos_utils import flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG
from quebra_frases import sentence_tokenize

Expand All @@ -53,7 +54,7 @@ def __init__(self, config=None,
self.enable_cache = enable_cache
self.config = config or {}
self.supported_langs = self.config.get("supported_langs") or []
self.default_lang = internal_lang or self.config.get("lang", "en")
self.default_lang = standardize_lang_tag(internal_lang or self.config.get("lang", "en"), macro=True)
if self.default_lang not in self.supported_langs:
self.supported_langs.insert(0, self.default_lang)
self._translator = translator or OVOSLangTranslationFactory.create() if self.enable_tx else None
Expand Down Expand Up @@ -123,9 +124,9 @@ def translate(self, text: str,
:param source_lang: Source language code.
:return: Translated text.
"""
source_lang = source_lang or self.detect_language(text)
target_lang = target_lang or self.default_lang
if source_lang.split("-")[0] == target_lang.split("-")[0]:
source_lang = standardize_lang_tag(source_lang or self.detect_language(text), macro=True)
target_lang = standardize_lang_tag(target_lang or self.default_lang, macro=True)
if source_lang == target_lang:
return text # skip translation
return self.translator.translate(text,
target=target_lang,
Expand Down
5 changes: 0 additions & 5 deletions ovos_plugin_manager/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,3 @@ def create(config=None):
f'\nAvailable modules: {modules}')
raise
return tts


if __name__ == "__main__":
lang = "en-us"
print(find_tts_plugins())
23 changes: 4 additions & 19 deletions ovos_plugin_manager/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from typing import Optional

import pkg_resources
from ovos_utils.log import LOG, log_deprecation
from ovos_utils.log import LOG, log_deprecation, deprecated


class PluginTypes(str, Enum):
Expand Down Expand Up @@ -173,25 +173,10 @@ def load_plugin(plug_name: str, plug_type: Optional[PluginTypes] = None):
LOG.warning(f'Could not find the plugin {plug_type}.{plug_name}')
return None


@deprecated("normalize_lang has been deprecated! update to 'from ovos_utils.lang import standardize_lang_tag'", "1.0.0")
def normalize_lang(lang):
# TODO consider moving to LF or ovos_utils
# special handling, the parse sometimes messes this up
# eg, uk-ua gets normalized to uk-gb
# this also makes lookup easier as we
# often get duplicate entries with both variants
if "-" in lang:
pieces = lang.split("-")
if len(pieces) == 2 and pieces[0] == pieces[1]:
lang = pieces[0]

try:
from langcodes import standardize_tag as _normalize_lang
lang = _normalize_lang(lang, macro=True)
except ValueError:
# this lang code is apparently not valid ?
pass
return lang
from ovos_utils.lang import standardize_lang_tag
return standardize_lang_tag(lang)


class ReadWriteStream:
Expand Down
Loading

0 comments on commit 08ad348

Please sign in to comment.