-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add signal for language detection (#453)
- Loading branch information
Showing 6 changed files with 111 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
"""Language detection of a document.""" | ||
from typing import Iterable, Optional, cast | ||
|
||
import langdetect | ||
from langdetect import DetectorFactory, LangDetectException | ||
from typing_extensions import override | ||
|
||
from ..data.dataset_utils import lilac_span | ||
from ..schema import Field, Item, RichData, SignalInputType, field | ||
from .signal import TextSignal | ||
|
||
# Fix the seed on langdetect's DetectorFactory for consistent results.
DetectorFactory.seed = 42

# Key under which the detected language code is stored in the dict attached to
# each span emitted by the language detection signal.
LANG_CODE = 'lang_code'
|
||
|
||
class LangDetectionSignal(TextSignal):
    """Detects the language code in text.
    <br>
    Supports 55 languages returning their
    [ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
    """
    # NOTE: the class docstring above is kept verbatim because it is reachable
    # at runtime through `__doc__`.

    # Identifier and human-readable label of the signal.
    name = 'lang_detection'
    display_name = 'Language detection'

    # The signal both accepts and computes over plain text.
    input_type = SignalInputType.TEXT
    compute_type = SignalInputType.TEXT

    @override
    def fields(self) -> Field:
        """Describe the output: a list of string spans, each with a language code."""
        span_field = field('string_span', fields={LANG_CODE: 'string'})
        return field(fields=[span_field])

    @override
    def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
        """Yield, for every document, the language spans of its paragraphs.

        Each document is split on blank lines (two consecutive newlines). Every
        paragraph that is non-empty after stripping whitespace is passed to
        `langdetect.detect`; on success a span is recorded from the paragraph's
        start offset to the offset of the following separator (or the end of
        the document), carrying the detected code under `LANG_CODE`.
        Paragraphs for which detection raises `LangDetectException` are
        skipped.

        Args:
          data: The documents to process; treated as an iterable of strings.

        Yields:
          One list of spans per input document, in input order. The list is
          empty when no paragraph produced a language code.
        """
        docs = cast(Iterable[str], data)
        # Paragraphs are separated by a blank line.
        separator = '\n\n'
        separator_len = len(separator)

        for doc in docs:
            spans: list[Item] = []
            doc_len = len(doc)
            start = 0
            while start < doc_len:
                found_at = doc.find(separator, start)
                # No further separator: the paragraph runs to the end of the document.
                end = doc_len if found_at == -1 else found_at
                paragraph = doc[start:end].strip()
                if paragraph:
                    try:
                        # Offsets refer to the unstripped paragraph range.
                        spans.append(
                            lilac_span(start, end, {LANG_CODE: langdetect.detect(paragraph)}))
                    except LangDetectException:
                        # Detection failed for this paragraph; emit no span for it.
                        pass
                # Resume scanning just past the separator.
                start = end + separator_len
            yield spans
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
"""Tests for the language detection signal.""" | ||
|
||
from ..data.dataset_utils import lilac_span | ||
from .lang_detection import LANG_CODE, LangDetectionSignal | ||
|
||
|
||
def test_lang_detection_sentences() -> None:
    """Each single-paragraph document yields exactly one span with its language."""
    english = 'War doesnt show whos right, just whos left.'
    german = 'Ein, zwei, drei, vier'
    # One span per document, covering the whole text.
    expected = [
        [lilac_span(0, 43, {LANG_CODE: 'en'})],
        [lilac_span(0, 21, {LANG_CODE: 'de'})],
    ]

    detector = LangDetectionSignal()
    detected = list(detector.compute([english, german]))

    assert detected == expected
|
||
|
||
def test_lang_detection_multiple_paragraphs() -> None:
    """A document with two paragraphs yields one language span per paragraph."""
    two_paragraphs = 'War doesnt show whos right, just whos left.\n\nEin, zwei, drei, vier'
    # The second span starts right after the two-character paragraph separator.
    expected_spans = [
        lilac_span(0, 43, {LANG_CODE: 'en'}),
        lilac_span(45, 66, {LANG_CODE: 'de'}),
    ]

    detector = LangDetectionSignal()
    detected = list(detector.compute([two_paragraphs]))

    assert detected == [expected_spans]