Skip to content

Commit

Permalink
Use charset_normalizer instead of chardet
Browse files Browse the repository at this point in the history
  • Loading branch information
joheriks committed Mar 22, 2024
1 parent bdfd975 commit ecc3511
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 5 deletions.
2 changes: 1 addition & 1 deletion requirements/base.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-c constraints.in
-chardet
+charset-normalizer
filetype
python-magic
lxml
Expand Down
3 changes: 1 addition & 2 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@ certifi==2024.2.2
# -c constraints.in
# requests
# unstructured-client
-chardet==5.2.0
-# via -r base.in
charset-normalizer==3.3.2
# via
+# -r base.in
# requests
# unstructured-client
click==8.1.7
Expand Down
4 changes: 2 additions & 2 deletions unstructured/file_utils/encoding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import IO, Optional, Tuple, Union

-import chardet
+import charset_normalizer

from unstructured.partition.common import convert_to_bytes

Expand Down Expand Up @@ -70,7 +70,7 @@ def detect_file_encoding(
else:
raise FileNotFoundError("No filename nor file were specified")

-result = chardet.detect(byte_data)
+result = charset_normalizer.detect(byte_data)
encoding = result["encoding"]
confidence = result["confidence"]

Expand Down

0 comments on commit ecc3511

Please sign in to comment.