Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Episodes fix #245

Merged
merged 28 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d570f7d
add previous and next episodes to dataframe
patrick-troy Nov 23, 2023
1a1eda7
Create input test file
StephenCarterLIIA Feb 5, 2024
11f77d9
Latest changes
StephenCarterLIIA Feb 6, 2024
0d5a1cf
Work in progress
StephenCarterLIIA Feb 6, 2024
246aab8
Work in progress
StephenCarterLIIA Feb 12, 2024
eb96556
WIP - sort out rule to apply
StephenCarterLIIA Feb 12, 2024
43baf30
WIP - correctly identifies which stage 1 rule to apply
StephenCarterLIIA Feb 14, 2024
1632b25
WIP
StephenCarterLIIA Feb 14, 2024
892f6aa
WIP - stage 1 rules working
StephenCarterLIIA Feb 26, 2024
88eb385
WIP - applied stage 2 rules
StephenCarterLIIA Feb 27, 2024
6634508
Apply all rule fixes
StephenCarterLIIA Mar 1, 2024
bb85d64
add unit test
patrick-troy Mar 13, 2024
cd1472e
Add unit test
StephenCarterLIIA Mar 14, 2024
73d50a2
Add test for is_next_episode_duplicate
StephenCarterLIIA Mar 25, 2024
9904ca1
WIP Add more tests and function stubs
StephenCarterLIIA Mar 26, 2024
2d3e64d
WIP Add more test functions for episode fixes
StephenCarterLIIA Apr 2, 2024
6d7fa6b
Add more unit tests for episode fix
StephenCarterLIIA Apr 3, 2024
7f28a1a
Merge remote-tracking branch 'origin/main' into episodes-fix
StephenCarterLIIA Apr 3, 2024
89490c2
Implemented suggestions
StephenCarterLIIA May 20, 2024
d5111fa
add temp unit test rewriter
May 24, 2024
8e42565
add temp unit test rewriter
May 24, 2024
479135f
Expanded unit tests for episode fixes
StephenCarterLIIA May 29, 2024
7ee4a18
Merge remote-tracking branch 'origin/episodes-fix' into episodes-fix
StephenCarterLIIA May 29, 2024
bded68c
cleanup tests, remove test_rewriter
May 30, 2024
6e85e29
Update pan-agg yml
StephenCarterLIIA May 31, 2024
2bd1001
Merge remote-tracking branch 'refs/remotes/origin/episodes-fix' into …
StephenCarterLIIA May 31, 2024
0127ed2
Episode fixes minor tidy up
StephenCarterLIIA May 31, 2024
d2c70ce
Removed comment
StephenCarterLIIA Jun 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 55 additions & 54 deletions liiatools/datasets/s903/lds_ssda903_clean/prep.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,66 @@
import logging
import pandas as pd
from pathlib import Path
import cchardet as chardet

# import cchardet as chardet
from datetime import datetime

import pandas.errors


def file_encoding_audit(
    data_folder: Path,
) -> pd.DataFrame:
    """
    Detect file encodings under a folder and report low-confidence files.

    Walks every file below *data_folder*, runs cchardet's fast encoding
    detection on the raw bytes, and returns the subset of files whose
    detected encoding confidence is below 1.0 (i.e. files that may need
    manual encoding review). The report is also written to
    ``encoding_audit.csv`` in the current working directory.

    :param data_folder: Path object that is a folder containing files to be processed
    :type data_folder: Path
    :return: A Dataframe of low confidence encoded files with columns
        ``local_authority``, ``file_name``, ``encoding``, ``confidence``
    :rtype: pd.DataFrame
    """

    # TODO - Check csv encoding type of file, save to utf-8
    # TODO - Check xml encoding type of file
    # Save as an acceptable format
    rows = []

    for cdf in data_folder.glob("**/*"):
        # BUG FIX: the previous check `"log" not in cdf.root` compared against
        # Path.root (the filesystem root, "/" or ""), which never contains
        # "log", so log files were never skipped here. Inspect the actual
        # path components instead.
        if cdf.is_file() and not any("log" in part.lower() for part in cdf.parts):
            result = chardet.detect(cdf.read_bytes())
            # Consume the detect() dict directly instead of formatting it to a
            # string and stripping braces/quotes with chained .replace() calls.
            rows.append(
                {
                    # assumes the folder layout puts the LA name three levels
                    # up from the file (as the original cdf.parts[-3] did) —
                    # TODO confirm against the ingest folder structure
                    "local_authority": cdf.parts[-3],
                    "file_name": cdf.stem,
                    "encoding": result.get("encoding"),
                    "confidence": result.get("confidence"),
                }
            )

    # Build the report frame from structured rows; column order matches the
    # original string-split output.
    encoding_df = pd.DataFrame(
        rows, columns=["local_authority", "file_name", "encoding", "confidence"]
    )

    # Filter out log files and drop high confidence file types.
    # Compare the confidence value numerically: the old
    # .str.contains("1.0") treated "." as a regex wildcard and could
    # wrongly match other confidence strings (e.g. "0.9150").
    encoded_df = encoding_df[
        ~encoding_df["file_name"].astype(str).str.contains("Logs")
        & (encoding_df["confidence"] != 1.0)
    ]

    encoded_df.to_csv("encoding_audit.csv", encoding="utf-8")
    return encoded_df
# def file_encoding_audit(
# data_folder: Path,
# ) -> pd.DataFrame:
# """
# Function takes in a folder path object, it then uses the cchardet library to fast detect the file encoding types
#
# :param data_folder: Path object that is a folder containing files to be processed
# :type data_folder: Path
# :return: A Dataframe of low confidence encoded files
# :rtype: pd.DataFrame
# """
#
# # TODO - Check csv encoding type of file, save to utf-8
# # TODO - Check xml encoding type of file
# # Save as an acceptable format
# result_out = []
#
# for cdf in data_folder.glob("**/*"):
# if cdf.is_file() and "log" not in cdf.root:
# with open(cdf, "rb") as f:
# msg = f.read()
# result = chardet.detect(msg)
# out = f"{cdf.parts[-3]}, {cdf.stem}, {result}"
# # this is messy.
# outt = (
# out.replace("}", "")
# .replace("{", "")
# .replace("confidence", "")
# .replace("encoding", "")
# .replace("'':", "")
# )
# result_out.append(outt)
#
# # Save the outputs of the list generated by running cchardet on the file list,
# # the result is then appended into a dataframe and filtered to return a list of files that
# # have low confidence as to their encoding types.
# encoding_series = pd.Series(result_out)
#
# encoding_df = pd.DataFrame(encoding_series, columns=["file_name"])
#
# # Split out dataframe
# encoding_df[
# ["local_authority", "file_name", "encoding", "confidence"]
# ] = encoding_df.file_name.str.split(",", expand=True)
#
# # Filter out log files and drop high confidence files types
# encoded_df = encoding_df[
# ~encoding_df["file_name"].str.contains("Logs")
# & ~(encoding_df["confidence"].str.contains("1.0"))
# ]
#
# encoded_df.to_csv("encoding_audit.csv", encoding="utf-8")
# return encoded_df


def delete_unrequired_files(input: str, drop_file_list: list, la_log_dir: str):
Expand Down
Empty file.
Loading
Loading