Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add signal for language detection #453

Merged
merged 2 commits into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,7 @@ follow_imports = skip
[mypy-scipy.integrate.*]
ignore_missing_imports = True
follow_imports = skip

[mypy-langdetect.*]
ignore_missing_imports = True
follow_imports = skip
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ typing-extensions = "^4.7.1"
uvicorn = {extras = ["standard"], version = "^0.22.0"}
joblib = "^1.3.1"
tenacity = "^8.2.2"
gunicorn = "^20.1.0"

# For fast JSON serialization: https://fastapi.tiangolo.com/advanced/custom-response/#use-orjsonresponse
orjson = "^3.8.10"
Expand All @@ -50,11 +51,13 @@ email-reply-parser = "^0.5.12"

# For text statistics.
textacy = "^0.13.0"
gunicorn = "^20.1.0"

# For PII and secrets.
detect-secrets = "^1.4.0"

# For language detection.
langdetect = "^1.0.9"

[tool.poetry.group.dev] # Deps for development.
optional = true

Expand Down
2 changes: 2 additions & 0 deletions src/signals/default_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..embeddings.sbert import SBERT
from .concept_labels import ConceptLabelsSignal
from .concept_scorer import ConceptScoreSignal
from .lang_detection import LangDetectionSignal
from .near_dup import NearDuplicateSignal
from .ner import SpacyNER
from .pii import PIISignal
Expand All @@ -23,6 +24,7 @@ def register_default_signals() -> None:
register_signal(TextStatisticsSignal)
register_signal(SpacyNER)
register_signal(NearDuplicateSignal)
register_signal(LangDetectionSignal)

# Embeddings.
register_signal(Cohere)
Expand Down
62 changes: 62 additions & 0 deletions src/signals/lang_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Language detection of a document."""
from typing import Iterable, Optional, cast

import langdetect
from langdetect import DetectorFactory, LangDetectException
from typing_extensions import override

from ..data.dataset_utils import lilac_span
from ..schema import Field, Item, RichData, SignalInputType, field
from .signal import TextSignal

# For consistent results.
DetectorFactory.seed = 42

LANG_CODE = 'lang_code'


class LangDetectionSignal(TextSignal):
  """Detects the language code in text.

  <br>

  Supports 55 languages returning their
  [ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
  """
  name = 'lang_detection'
  display_name = 'Language detection'

  input_type = SignalInputType.TEXT
  compute_type = SignalInputType.TEXT

  @override
  def fields(self) -> Field:
    # One span per detected paragraph, each annotated with its language code.
    return field(fields=[field('string_span', fields={LANG_CODE: 'string'})])

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    """Yield, for each input document, a list of language-annotated spans.

    The document is split on blank lines ('\\n\\n'); each non-empty paragraph
    gets one span with its detected ISO 639-1 code. Paragraphs where detection
    fails (e.g. no linguistic features) are skipped. Exactly one item is
    yielded per input document, so outputs stay aligned with inputs.
    """
    data = cast(Iterable[str], data)
    # Split on paragraphs.
    split_symbol = '\n\n'

    for text in data:
      if not text:
        # Yield a placeholder for empty documents. A `return` here would end
        # the generator and silently drop every remaining document.
        yield None
        continue

      offset = 0
      result: list[Item] = []
      while offset < len(text):
        new_offset = text.find(split_symbol, offset)
        if new_offset == -1:
          # Last paragraph: extend the span to the end of the document.
          new_offset = len(text)
        text_span = text[offset:new_offset].strip()
        if text_span:
          try:
            lang_code = langdetect.detect(text_span)
            result.append(lilac_span(offset, new_offset, {LANG_CODE: lang_code}))
          except LangDetectException:
            # langdetect raises on spans with no detectable features
            # (digits/punctuation only); skip the span, keep the document.
            pass
        offset = new_offset + len(split_symbol)
      yield result
27 changes: 27 additions & 0 deletions src/signals/lang_detection_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Tests for the language detection signal."""

from ..data.dataset_utils import lilac_span
from .lang_detection import LANG_CODE, LangDetectionSignal


def test_lang_detection_sentences() -> None:
  """Each single-paragraph document yields one span covering the whole text."""
  english_doc = 'War doesnt show whos right, just whos left.'
  german_doc = 'Ein, zwei, drei, vier'

  signal = LangDetectionSignal()
  results = list(signal.compute([english_doc, german_doc]))

  expected = [
    [lilac_span(0, len(english_doc), {LANG_CODE: 'en'})],
    [lilac_span(0, len(german_doc), {LANG_CODE: 'de'})],
  ]
  assert results == expected


def test_lang_detection_multiple_paragraphs() -> None:
  """A blank-line-separated document produces one span per paragraph."""
  doc = 'War doesnt show whos right, just whos left.\n\nEin, zwei, drei, vier'

  signal = LangDetectionSignal()
  (result,) = list(signal.compute([doc]))

  assert result == [
    lilac_span(0, 43, {LANG_CODE: 'en'}),
    lilac_span(45, 66, {LANG_CODE: 'de'}),
  ]