Skip to content

Commit

Permalink
Add signal for language detection (#453)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsmilkov authored Jul 18, 2023
1 parent c744068 commit f30826d
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 2 deletions.
4 changes: 4 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,7 @@ follow_imports = skip
[mypy-scipy.integrate.*]
ignore_missing_imports = True
follow_imports = skip

[mypy-langdetect.*]
ignore_missing_imports = True
follow_imports = skip
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ typing-extensions = "^4.7.1"
uvicorn = {extras = ["standard"], version = "^0.22.0"}
joblib = "^1.3.1"
tenacity = "^8.2.2"
gunicorn = "^20.1.0"

# For fast JSON serialization: https://fastapi.tiangolo.com/advanced/custom-response/#use-orjsonresponse
orjson = "^3.8.10"
Expand All @@ -50,11 +51,13 @@ email-reply-parser = "^0.5.12"

# For text statistics.
textacy = "^0.13.0"
gunicorn = "^20.1.0"

# For PII and secrets.
detect-secrets = "^1.4.0"

# For language detection.
langdetect = "^1.0.9"

[tool.poetry.group.dev] # Deps for development.
optional = true

Expand Down
2 changes: 2 additions & 0 deletions src/signals/default_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..embeddings.sbert import SBERT
from .concept_labels import ConceptLabelsSignal
from .concept_scorer import ConceptScoreSignal
from .lang_detection import LangDetectionSignal
from .near_dup import NearDuplicateSignal
from .ner import SpacyNER
from .pii import PIISignal
Expand All @@ -23,6 +24,7 @@ def register_default_signals() -> None:
register_signal(TextStatisticsSignal)
register_signal(SpacyNER)
register_signal(NearDuplicateSignal)
register_signal(LangDetectionSignal)

# Embeddings.
register_signal(Cohere)
Expand Down
59 changes: 59 additions & 0 deletions src/signals/lang_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Language detection of a document."""
from typing import Iterable, Optional, cast

import langdetect
from langdetect import DetectorFactory, LangDetectException
from typing_extensions import override

from ..data.dataset_utils import lilac_span
from ..schema import Field, Item, RichData, SignalInputType, field
from .signal import TextSignal

# For consistent results.
DetectorFactory.seed = 42

LANG_CODE = 'lang_code'


class LangDetectionSignal(TextSignal):
  """Detects the language code in text.
  <br>
  Supports 55 languages returning their
  [ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
  """
  name = 'lang_detection'
  display_name = 'Language detection'

  input_type = SignalInputType.TEXT
  compute_type = SignalInputType.TEXT

  @override
  def fields(self) -> Field:
    # Each document maps to a list of string spans, one per detected paragraph,
    # each annotated with its ISO 639-1 language code.
    span_field = field('string_span', fields={LANG_CODE: 'string'})
    return field(fields=[span_field])

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    documents = cast(Iterable[str], data)
    # Paragraphs are delimited by a blank line.
    delimiter = '\n\n'

    for doc in documents:
      spans: list[Item] = []
      start = 0
      for paragraph in doc.split(delimiter):
        end = start + len(paragraph)
        stripped = paragraph.strip()
        # Whitespace-only paragraphs carry no signal; skip them entirely.
        if stripped:
          try:
            lang_code = langdetect.detect(stripped)
            spans.append(lilac_span(start, end, {LANG_CODE: lang_code}))
          except LangDetectException:
            # langdetect raises when it finds no linguistic features
            # (e.g. digits or punctuation only); silently skip the paragraph.
            pass
        start = end + len(delimiter)
      yield spans
27 changes: 27 additions & 0 deletions src/signals/lang_detection_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Tests for the language detection signal."""

from ..data.dataset_utils import lilac_span
from .lang_detection import LANG_CODE, LangDetectionSignal


def test_lang_detection_sentences() -> None:
  """Each single-paragraph document yields one span covering the whole text."""
  english = 'War doesnt show whos right, just whos left.'
  german = 'Ein, zwei, drei, vier'

  signal = LangDetectionSignal()
  results = list(signal.compute([english, german]))

  assert results == [
    [lilac_span(0, len(english), {LANG_CODE: 'en'})],
    [lilac_span(0, len(german), {LANG_CODE: 'de'})],
  ]


def test_lang_detection_multiple_paragraphs() -> None:
  """A two-paragraph document yields one span per paragraph, offsets past the delimiter."""
  english = 'War doesnt show whos right, just whos left.'
  german = 'Ein, zwei, drei, vier'
  doc = english + '\n\n' + german

  results = list(signal.compute([doc])) if False else list(LangDetectionSignal().compute([doc]))

  assert results == [[
    lilac_span(0, len(english), {LANG_CODE: 'en'}),
    lilac_span(len(english) + 2, len(doc), {LANG_CODE: 'de'}),
  ]]

0 comments on commit f30826d

Please sign in to comment.