Use charset_normalizer instead of chardet

stepzen-dev · Mar 22, 2024 · ecc3511
1 parent bdfd975
commit ecc3511
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 5 deletions.
diff --git a/requirements/base.in b/requirements/base.in
@@ -1,5 +1,5 @@
 -c constraints.in
-chardet
+charset-normalizer
 filetype
 python-magic
 lxml

diff --git a/requirements/base.txt b/requirements/base.txt
@@ -13,10 +13,9 @@ certifi==2024.2.2
     #   -c constraints.in
     #   requests
     #   unstructured-client
-chardet==5.2.0
-    # via -r base.in
 charset-normalizer==3.3.2
     # via
+    #   -r base.in
     #   requests
     #   unstructured-client
 click==8.1.7

diff --git a/unstructured/file_utils/encoding.py b/unstructured/file_utils/encoding.py
@@ -1,6 +1,6 @@
 from typing import IO, Optional, Tuple, Union
 
-import chardet
+import charset_normalizer
 
 from unstructured.partition.common import convert_to_bytes
 
@@ -70,7 +70,7 @@ def detect_file_encoding(
     else:
         raise FileNotFoundError("No filename nor file were specified")
 
-    result = chardet.detect(byte_data)
+    result = charset_normalizer.detect(byte_data)
     encoding = result["encoding"]
     confidence = result["confidence"]