Skip to content

Commit

Permalink
Fix encoding issues for utf-16 (#3)
Browse files Browse the repository at this point in the history
* fix issues with utf-16 encoded subs

* add auto encoding detection using chardet

* add chardet as dependency and bump python to 3.7

* bump napi-py to 0.2.3

* add utf-16 test
  • Loading branch information
steciuk authored Jun 15, 2024
1 parent 86a1aa0 commit 3632fea
Show file tree
Hide file tree
Showing 5 changed files with 203 additions and 214 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
.pytest_cache/
*.egg-info/
build/
dist/
dist/
__pycache__/
67 changes: 33 additions & 34 deletions napi/encoding.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,45 @@
import locale
from typing import Tuple, Optional

# Candidate encodings, listed in the order in which decoding is attempted.
DECODING_ORDER = [
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "utf-8",
]

# Mojibake produced when the UTF-8 bytes of a Polish diacritic are decoded as
# windows-1250. Entry i is the garbled form of POLISH_DIACRITICS[i].
SYMBOLS_WHEN_ENCODING_UTF8_AS_WIN1250 = [
    "Ĺş",  # ź
    "ĹĽ",  # ż
    "Ĺ‚",  # ł
    "Ĺ›",  # ś
    # Previously this entry was the correctly decoded letter itself, which made
    # every real "ć" count as an error symbol as well as a diacritic.
    "Ä‡",  # ć
    "Ä…",  # ą
    "Ä™",  # ę
    "Ăł",  # ó
    "Ĺ„",  # ń
]

# Lower-case Polish letters with diacritics that are looked for in decoded text.
POLISH_DIACRITICS = ["ź", "ż", "ł", "ś", "ć", "ą", "ę", "ó", "ń"]

# Maximum number of leading whitespace-separated words that are inspected.
CHECK_IN_WORD_COUNT = 1000


def _diacritics_count_in_word(word: str) -> int:
    """Count the entries of POLISH_DIACRITICS that occur in *word*.

    The comparison is case-insensitive, and an entry contributes at most one
    to the total regardless of how many times it appears in the word.
    """
    lowered_word = word.lower()
    return sum(1 for diacritic in POLISH_DIACRITICS if diacritic.lower() in lowered_word)


def _err_symbol_count_in_word(word: str) -> int:
    """Count the entries of SYMBOLS_WHEN_ENCODING_UTF8_AS_WIN1250 found in *word*.

    The comparison is case-insensitive, and an entry contributes at most one
    to the total regardless of how many times it appears in the word.
    """
    lowered_word = word.lower()
    return sum(1 for symbol in SYMBOLS_WHEN_ENCODING_UTF8_AS_WIN1250 if symbol.lower() in lowered_word)
from typing import Optional, Tuple

import chardet

# Candidate encodings, listed in the order in which decoding is attempted.
DECODING_ORDER = [
    "utf-16",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "utf-8",
]

# Number of leading characters of the decoded text that are inspected.
CHECK_NUM_CHARS = 5000

# Auto-detected encodings are used only when the reported confidence is
# strictly greater than this value.
AUTO_DETECT_THRESHOLD = 0.9


def _is_ascii(c: str) -> bool:
return ord(c) < 128


def _is_polish_diacritic(c: str) -> bool:
return c in "ąćęłńóśżźĄĆĘŁŃÓŚŻŹ"


# Heuristic check of decoded subtitle text: tallies Polish diacritics against
# "error" symbols and reports True only when the diacritics are in the majority.
def _is_correct_encoding(subs: str) -> bool:
err_symbols, diacritics = 0, 0
# NOTE(review): this view is a rendered diff without +/- markers, so the
# per-word loop below is presumably the removed implementation and the
# per-character loop its replacement - confirm against the real file.
# Per-word pass: inspects at most CHECK_IN_WORD_COUNT whitespace-separated words.
for word in subs.split()[:CHECK_IN_WORD_COUNT]:
diacritics += _diacritics_count_in_word(word)
err_symbols += _err_symbol_count_in_word(word)
# Per-character pass: inspects at most CHECK_NUM_CHARS leading characters.
for char in subs[:CHECK_NUM_CHARS]:
if _is_polish_diacritic(char):
diacritics += 1
# Any non-ASCII character that is not a Polish diacritic counts as an error symbol.
elif not _is_ascii(char):
err_symbols += 1

# Strict comparison: a tie (including text with neither kind of character) yields False.
return err_symbols < diacritics


def _detect_encoding(subs: bytes) -> Tuple[Optional[str], float]:
    """Run chardet on *subs* and return its ``(encoding, confidence)`` pair.

    Both values are read from the ``"encoding"`` and ``"confidence"`` keys of
    the mapping returned by ``chardet.detect``.
    """
    detection = chardet.detect(subs)
    encoding = detection["encoding"]
    confidence = detection["confidence"]
    return encoding, confidence


def _try_decode(subs: bytes) -> Tuple[str, str]:
encoding, confidence = _detect_encoding(subs)
if encoding and confidence > AUTO_DETECT_THRESHOLD:
try:
return encoding, subs.decode(encoding)
except UnicodeDecodeError:
pass

last_exc = None
for i, enc in enumerate(DECODING_ORDER):
try:
Expand Down
Loading

0 comments on commit 3632fea

Please sign in to comment.