diff --git a/mypy.ini b/mypy.ini
index 446917559..9c4f87d86 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -112,3 +112,7 @@ follow_imports = skip
 [mypy-scipy.integrate.*]
 ignore_missing_imports = True
 follow_imports = skip
+
+[mypy-langdetect.*]
+ignore_missing_imports = True
+follow_imports = skip
diff --git a/poetry.lock b/poetry.lock
index 57952ca8b..3043f3de0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2627,6 +2627,20 @@ files = [
 [package.extras]
 data = ["language-data (>=1.1,<2.0)"]
 
+[[package]]
+name = "langdetect"
+version = "1.0.9"
+description = "Language detection library ported from Google's language-detection."
+optional = false
+python-versions = "*"
+files = [
+    {file = "langdetect-1.0.9-py2-none-any.whl", hash = "sha256:7cbc0746252f19e76f77c0b1690aadf01963be835ef0cd4b56dddf2a8f1dfc2a"},
+    {file = "langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "locket"
 version = "1.0.0"
@@ -6272,4 +6286,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = "~3.9"
-content-hash = "0fe7011e823632ce13aac8baffc73b672d774cfeb3df9aef817865cebaa1909e"
+content-hash = "edce308e1dd973099aa851bd460c8a64bd9b0f579e426df1452baf22e507726a"
diff --git a/pyproject.toml b/pyproject.toml
index adc506dcc..a927fade4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ typing-extensions = "^4.7.1"
 uvicorn = {extras = ["standard"], version = "^0.22.0"}
 joblib = "^1.3.1"
 tenacity = "^8.2.2"
+gunicorn = "^20.1.0"
 
 # For fast JSON serialization: https://fastapi.tiangolo.com/advanced/custom-response/#use-orjsonresponse
 orjson = "^3.8.10"
@@ -50,11 +51,13 @@ email-reply-parser = "^0.5.12"
 
 # For text statistics.
 textacy = "^0.13.0"
-gunicorn = "^20.1.0"
 
 # For PII and secrets.
 detect-secrets = "^1.4.0"
 
+# For language detection.
+langdetect = "^1.0.9"
+
 [tool.poetry.group.dev]
 # Deps for development.
 optional = true
diff --git a/src/signals/default_signals.py b/src/signals/default_signals.py
index fedff0fd6..dddffd1f2 100644
--- a/src/signals/default_signals.py
+++ b/src/signals/default_signals.py
@@ -5,6 +5,7 @@
 from ..embeddings.sbert import SBERT
 from .concept_labels import ConceptLabelsSignal
 from .concept_scorer import ConceptScoreSignal
+from .lang_detection import LangDetectionSignal
 from .near_dup import NearDuplicateSignal
 from .ner import SpacyNER
 from .pii import PIISignal
@@ -23,6 +24,7 @@ def register_default_signals() -> None:
   register_signal(TextStatisticsSignal)
   register_signal(SpacyNER)
   register_signal(NearDuplicateSignal)
+  register_signal(LangDetectionSignal)
 
   # Embeddings.
   register_signal(Cohere)
diff --git a/src/signals/lang_detection.py b/src/signals/lang_detection.py
new file mode 100644
index 000000000..2508cd721
--- /dev/null
+++ b/src/signals/lang_detection.py
@@ -0,0 +1,59 @@
+"""Language detection of a document."""
+from typing import Iterable, Optional, cast
+
+import langdetect
+from langdetect import DetectorFactory, LangDetectException
+from typing_extensions import override
+
+from ..data.dataset_utils import lilac_span
+from ..schema import Field, Item, RichData, SignalInputType, field
+from .signal import TextSignal
+
+# For consistent results.
+DetectorFactory.seed = 42
+
+LANG_CODE = 'lang_code'
+
+
+class LangDetectionSignal(TextSignal):
+  """Detects the language code in text.
+
+  Supports 55 languages, returning their
+  [ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
+  """
+  name = 'lang_detection'
+  display_name = 'Language detection'
+
+  input_type = SignalInputType.TEXT
+  compute_type = SignalInputType.TEXT
+
+  @override
+  def fields(self) -> Field:
+    return field(fields=[field('string_span', fields={LANG_CODE: 'string'})])
+
+  @override
+  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
+    data = cast(Iterable[str], data)
+    # Split on paragraphs.
+    split_symbol = '\n\n'
+
+    for text in data:
+      offset = 0
+      new_offset = 0
+      result: list[Item] = []
+      while offset < len(text):
+        new_offset = text.find(split_symbol, offset)
+        if new_offset == -1:
+          new_offset = len(text)
+        text_span = text[offset:new_offset]
+        text_span = text_span.strip()
+        if text_span:
+          try:
+            lang_code = langdetect.detect(text_span)
+            result.append(lilac_span(offset, new_offset, {LANG_CODE: lang_code}))
+          except LangDetectException:
+            pass
+        offset = new_offset + len(split_symbol)
+      yield result
diff --git a/src/signals/lang_detection_test.py b/src/signals/lang_detection_test.py
new file mode 100644
index 000000000..2324f39c9
--- /dev/null
+++ b/src/signals/lang_detection_test.py
@@ -0,0 +1,27 @@
+"""Tests for the language detection signal."""
+
+from ..data.dataset_utils import lilac_span
+from .lang_detection import LANG_CODE, LangDetectionSignal
+
+
+def test_lang_detection_sentences() -> None:
+  signal = LangDetectionSignal()
+  docs = [
+    'War doesnt show whos right, just whos left.',
+    'Ein, zwei, drei, vier',
+  ]
+  res = list(signal.compute(docs))
+  assert res == [
+    [lilac_span(0, 43, {LANG_CODE: 'en'})],
+    [lilac_span(0, 21, {LANG_CODE: 'de'})],
+  ]
+
+
+def test_lang_detection_multiple_paragraphs() -> None:
+  signal = LangDetectionSignal()
+  doc = 'War doesnt show whos right, just whos left.\n\nEin, zwei, drei, vier'
+  res = list(signal.compute([doc]))
+  assert res == [[
+    lilac_span(0, 43, {LANG_CODE: 'en'}),
+    lilac_span(45, 66, {LANG_CODE: 'de'}),
+  ]]
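
For quick manual verification outside pytest, the new signal can be driven the same way test_lang_detection_multiple_paragraphs drives it. The snippet below is a rough sketch only: the src.signals.lang_detection import path is an assumption about how the package is laid out and is not confirmed by this diff; adjust it to however the repo exposes the module.

# Illustrative driver, not part of the diff. The import path is assumed;
# the tests above use a relative import from within src/signals instead.
from src.signals.lang_detection import LangDetectionSignal

signal = LangDetectionSignal()
doc = 'War doesnt show whos right, just whos left.\n\nEin, zwei, drei, vier'

# compute() yields one list per input document; each entry is a lilac_span
# covering one '\n\n'-separated paragraph, annotated with its detected
# ISO 639-1 code under the 'lang_code' key.
for spans in signal.compute([doc]):
  for span in spans:
    print(span)  # Expect codes 'en' and 'de' here, matching the tests above.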