Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Episodes fix #245

Merged
merged 28 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d570f7d
add previous and next episodes to dataframe
patrick-troy Nov 23, 2023
1a1eda7
Create input test file
StephenCarterLIIA Feb 5, 2024
11f77d9
Latest changes
StephenCarterLIIA Feb 6, 2024
0d5a1cf
Work in progress
StephenCarterLIIA Feb 6, 2024
246aab8
Work in progress
StephenCarterLIIA Feb 12, 2024
eb96556
WIP - sort out rule to apply
StephenCarterLIIA Feb 12, 2024
43baf30
WIP - correctly identifies which stage 1 rule to apply
StephenCarterLIIA Feb 14, 2024
1632b25
WIP
StephenCarterLIIA Feb 14, 2024
892f6aa
WIP - stage 1 rules working
StephenCarterLIIA Feb 26, 2024
88eb385
WIP - applied stage 2 rules
StephenCarterLIIA Feb 27, 2024
6634508
Apply all rule fixes
StephenCarterLIIA Mar 1, 2024
bb85d64
add unit test
patrick-troy Mar 13, 2024
cd1472e
Add unit test
StephenCarterLIIA Mar 14, 2024
73d50a2
Add test for is_next_episode_duplicate
StephenCarterLIIA Mar 25, 2024
9904ca1
WIP Add more tests and function stubs
StephenCarterLIIA Mar 26, 2024
2d3e64d
WIP Add more test functions for episode fixes
StephenCarterLIIA Apr 2, 2024
6d7fa6b
Add more unit tests for episode fix
StephenCarterLIIA Apr 3, 2024
7f28a1a
Merge remote-tracking branch 'origin/main' into episodes-fix
StephenCarterLIIA Apr 3, 2024
89490c2
Implemented suggestions
StephenCarterLIIA May 20, 2024
d5111fa
add temp unit test rewriter
May 24, 2024
8e42565
add temp unit test rewriter
May 24, 2024
479135f
Expanded unit tests for episode fixes
StephenCarterLIIA May 29, 2024
7ee4a18
Merge remote-tracking branch 'origin/episodes-fix' into episodes-fix
StephenCarterLIIA May 29, 2024
bded68c
cleanup tests, remove test_rewriter
May 30, 2024
6e85e29
Update pan-agg yml
StephenCarterLIIA May 31, 2024
2bd1001
Merge remote-tracking branch 'refs/remotes/origin/episodes-fix' into …
StephenCarterLIIA May 31, 2024
0127ed2
Episode fixes minor tidy up
StephenCarterLIIA May 31, 2024
d2c70ce
Removed comment
StephenCarterLIIA Jun 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 55 additions & 54 deletions liiatools/datasets/s903/lds_ssda903_clean/prep.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,66 @@
import logging
import pandas as pd
from pathlib import Path
import cchardet as chardet

# import cchardet as chardet
from datetime import datetime

import pandas.errors


def file_encoding_audit(
    data_folder: Path,
) -> pd.DataFrame:
    """
    Detect file encodings under a folder and report low-confidence files.

    Walks every file below *data_folder*, runs cchardet's fast encoding
    detection on the raw bytes, and returns the subset of files whose
    detected encoding confidence is below 1.0 (i.e. files that may need
    manual encoding review). The report is also written to
    ``encoding_audit.csv`` in the current working directory.

    :param data_folder: Path object that is a folder containing files to be processed
    :type data_folder: Path
    :return: A Dataframe of low confidence encoded files with columns
        ``local_authority``, ``file_name``, ``encoding``, ``confidence``
    :rtype: pd.DataFrame
    """

    # TODO - Check csv encoding type of file, save to utf-8
    # TODO - Check xml encoding type of file
    # Save as an acceptable format
    rows = []

    for cdf in data_folder.glob("**/*"):
        # BUG FIX: the previous check `"log" not in cdf.root` compared against
        # Path.root (the filesystem root, "/" or ""), which never contains
        # "log", so log files were never skipped here. Inspect the actual
        # path components instead.
        if cdf.is_file() and not any("log" in part.lower() for part in cdf.parts):
            result = chardet.detect(cdf.read_bytes())
            # Consume the detect() dict directly instead of formatting it to a
            # string and stripping braces/quotes with chained .replace() calls.
            rows.append(
                {
                    # assumes the folder layout puts the LA name three levels
                    # up from the file (as the original cdf.parts[-3] did) —
                    # TODO confirm against the ingest folder structure
                    "local_authority": cdf.parts[-3],
                    "file_name": cdf.stem,
                    "encoding": result.get("encoding"),
                    "confidence": result.get("confidence"),
                }
            )

    # Build the report frame from structured rows; column order matches the
    # original string-split output.
    encoding_df = pd.DataFrame(
        rows, columns=["local_authority", "file_name", "encoding", "confidence"]
    )

    # Filter out log files and drop high confidence file types.
    # Compare the confidence value numerically: the old
    # .str.contains("1.0") treated "." as a regex wildcard and could
    # wrongly match other confidence strings (e.g. "0.9150").
    encoded_df = encoding_df[
        ~encoding_df["file_name"].astype(str).str.contains("Logs")
        & (encoding_df["confidence"] != 1.0)
    ]

    encoded_df.to_csv("encoding_audit.csv", encoding="utf-8")
    return encoded_df
# def file_encoding_audit(
# data_folder: Path,
# ) -> pd.DataFrame:
# """
# Function takes in a folder path object, it then uses the cchardet library to fast detect the file encoding types
#
# :param data_folder: Path object that is a folder containing files to be processed
# :type data_folder: Path
# :return: A Dataframe of low confidence encoded files
# :rtype: pd.DataFrame
# """
#
# # TODO - Check csv encoding type of file, save to utf-8
# # TODO - Check xml encoding type of file
# # Save as an acceptable format
# result_out = []
#
# for cdf in data_folder.glob("**/*"):
# if cdf.is_file() and "log" not in cdf.root:
# with open(cdf, "rb") as f:
# msg = f.read()
# result = chardet.detect(msg)
# out = f"{cdf.parts[-3]}, {cdf.stem}, {result}"
# # this is messy.
# outt = (
# out.replace("}", "")
# .replace("{", "")
# .replace("confidence", "")
# .replace("encoding", "")
# .replace("'':", "")
# )
# result_out.append(outt)
#
# # Save the outputs of the list generated by running cchardet on the file list,
# # the result is then appended into a dataframe and filtered to return a list of files that
# # have low confidence as to their encoding types.
# encoding_series = pd.Series(result_out)
#
# encoding_df = pd.DataFrame(encoding_series, columns=["file_name"])
#
# # Split out dataframe
# encoding_df[
# ["local_authority", "file_name", "encoding", "confidence"]
# ] = encoding_df.file_name.str.split(",", expand=True)
#
# # Filter out log files and drop high confidence files types
# encoded_df = encoding_df[
# ~encoding_df["file_name"].str.contains("Logs")
# & ~(encoding_df["confidence"].str.contains("1.0"))
# ]
#
# encoded_df.to_csv("encoding_audit.csv", encoding="utf-8")
# return encoded_df


def delete_unrequired_files(input: str, drop_file_list: list, la_log_dir: str):
Expand Down
Empty file.
Loading
Loading