From d570f7d8c89f2e591ab0bd758d7970d3bb7edab0 Mon Sep 17 00:00:00 2001 From: patrick-troy <58770937+patrick-troy@users.noreply.github.com> Date: Thu, 23 Nov 2023 13:35:23 +0000 Subject: [PATCH 01/25] add previous and next episodes to dataframe --- .../s903/lds_ssda903_episodes_fix/__init__.py | 0 .../s903/lds_ssda903_episodes_fix/process.py | 40 +++++++++++++++++++ .../datasets/s903/s903_main_functions.py | 30 ++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 liiatools/datasets/s903/lds_ssda903_episodes_fix/__init__.py create mode 100644 liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/__init__.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py new file mode 100644 index 00000000..1f445831 --- /dev/null +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -0,0 +1,40 @@ +import numpy as np +import pandas as pd + +__COLUMNS = [ + "DECOM", + "RNE", + "LS", + "CIN", + "PLACE", + "PLACE_PROVIDER", + "DEC", + "REC", + "REASON_PLACE_CHANGE", + "HOME_POST", + "PL_POST", + "URN", + "YEAR", +] + + +def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Add previous and next episode information to each line of a dataframe + + :param dataframe: Dataframe with SSDA903 Episodes data + :param columns: List of columns containing required data from previous/next episodes + :return: Dataframe with columns showing previous and next episodes + """ + for column in columns: + dataframe[column + "_previous"] = np.where( + dataframe["CHILD"] == dataframe["CHILD"].shift(1), + dataframe[column].shift(1), + None, + ) + dataframe[column + "_next"] = np.where( + dataframe["CHILD"] == dataframe["CHILD"].shift(-1), + dataframe[column].shift(-1), + None, + ) + return dataframe diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index e73f5c36..a34257ec 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -24,6 +24,9 @@ from liiatools.datasets.s903.lds_ssda903_sufficiency import configuration as suff_config from liiatools.datasets.s903.lds_ssda903_sufficiency import process as suff_process +# dependencies for episodes fix() +from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process + from liiatools.spec import common as common_asset_dir from liiatools.datasets.shared_functions import ( prep, @@ -200,3 +203,30 @@ def sufficiency_output(input, output): minimise = config["minimise"] s903_df = suff_process.data_min(s903_df, minimise, table_name) suff_process.export_suff_file(output, table_name, s903_df) + + +def episodes_fix(input, output): + """" + Applies fixes to la_agg SSDA903 Episodes files + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param output: should specify the path to the output folder + :return: None + """ + + # Configuration + config = agg_config.Config() + + # Read file and match type + s903_df = common_process.read_file(input) + column_names = config["column_names"] + table_name = common_process.match_load_file(s903_df, column_names) + if table_name == "Episodes": + s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) + 
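# Review sketch (illustrative, not part of the patch): a minimal, self-contained
# demonstration of the previous/next-episode technique used by
# create_previous_and_next_episode - shift(1)/shift(-1) masked with np.where so
# values never leak across different CHILD ids. The tiny frame is made up.
import numpy as np
import pandas as pd

demo = pd.DataFrame(
    {"CHILD": ["A", "A", "B"], "DECOM": ["2016-01-13", "2017-08-20", "2019-05-10"]}
)
demo["DECOM_previous"] = np.where(
    demo["CHILD"] == demo["CHILD"].shift(1), demo["DECOM"].shift(1), None
)
demo["DECOM_next"] = np.where(
    demo["CHILD"] == demo["CHILD"].shift(-1), demo["DECOM"].shift(-1), None
)
# demo.groupby("CHILD")["DECOM"].shift(1) would give the same "previous" column
# while preserving dtypes, which matters later because np.where coerces types.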
s903_df_next = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) + print(s903_df_next) + + +episodes_fix( + r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\903\SSDA903_episodes.csv", + r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\903" +) From 1a1eda759a0e3640d14d8f4bf820823a48476e65 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 5 Feb 2024 14:21:08 +0000 Subject: [PATCH 02/25] Create input test file --- ...DA903_episodes_for_testing_fixes_INPUT.csv | 19 +++++++++++++++++++ .../datasets/s903/s903_main_functions.py | 17 ++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv new file mode 100644 index 00000000..c402e4a2 --- /dev/null +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv @@ -0,0 +1,19 @@ +CHILD,DECOM,RNE,LS,CIN,PLACE,PLACE_PROVIDER,DEC,REC,REASON_PLACE_CHANGE,HOME_POST,PL_POST,URN,LA,YEAR +RULE1_BAD,2019/06/21,P,C2,N1,H5,PR4,2019/07/23,E6,,R1 1,Z28 8,,Barking and Dagenham,2020 +RULE1_BAD,2017/08/20,T,C2,N1,U4,PR4,2019/06/21,X1,CARPL,R1 1,Z8 5,4743233,Barking and Dagenham,2020 +RULE1_BAD,2016/01/13,P,C2,N1,U6,PR4,,,,R1 1,Z8 5,4743233,Barking and Dagenham,2018 +RULE1A_BAD,2017/04/20,S,V2,N1,H5,PR4,2017/06/13,E8,,R1A 1,Z28 2,,Barking and Dagenham,2018 +RULE1A_BAD,2016/01/22,S,V2,N1,P2,PR4,2016/10/24,X1,CARPL,R1A 1,Z25 1,,Barking and Dagenham,2017 +RULE1A_BAD,2016/10/24,P,V2,N1,P2,PR5,,,,R1A 1,Z8 5,,Barking and Dagenham,2017 +RULE2_BAD,2019/05/10,S,J1,N6,R5,PR3,,,,R2 1,Z32 4,,Barking and Dagenham,2020 +RULE3_BAD,2016/09/23,S,V2,N1,U6,PR1,2017/05/30,E2,,R3 1,Z12 3,4270000,Barking and Dagenham,2018 +RULE3_BAD,2016/09/27,S,V2,N1,U6,PR1,,,,R3 1,Z12 3,4270000,Barking and Dagenham,2017 +RULE3A_BAD,2018/01/25,L,C1,N1,U3,PR1,2018/05/16,E45,PLACE,R3A 1,Z11 4,4740000,Barking and Dagenham,2019 +RULE3A_BAD,2017/11/25,S,V2,N1,U3,PR1,2018/01/25,X1,,R3A 1,Z11 4,4740000,Barking and Dagenham,2018 +RULE3A_BAD,2018/03/22,T,C1,N1,U1,PR1,,,,,,4740000,Barking and Dagenham,2018 +RULE4_BAD,2019/08/19,P,C1,N1,P1,PR0,2019/10/01,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 +RULE4_BAD,2018/12/20,P,C1,N1,U6,PR1,2019/08/19,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 +RULE4_BAD,2018/11/23,S,C1,N1,U6,PR1,2018/12/24,X1,OTHER,R4 3,Z22 9,4890000,Barking and Dagenham,2019 +RULE4_BAD,2018/12/24,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 +NORULE_BAD,2016/07/26,T,C2,N1,U4,PR4,2021/12/12,E17,,ABC 1,DEF 2,4610000,Barking and Dagenham,2022 +NORULE_BAD,2015/06/19,P,C2,N1,U6,PR4,2016/07/26,X1,CARPL,ABC 1,FGH 3,4610000,Barking and Dagenham,2017 diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index a34257ec..27be02b3 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -221,12 +221,23 @@ def episodes_fix(input, output): column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) if table_name == "Episodes": + dates = config["dates"] + s903_df = common_process.convert_datetimes(s903_df, dates, table_name) s903_df = 
s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) + print (s903_df.dtypes) s903_df_next = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) - print(s903_df_next) + + # Following code used to test outputs during development + s903_df_next = s903_df_next.sort_values(["CHILD", "DECOM"], ignore_index=True) + print (s903_df_next.dtypes) # Issue: np.where is converting types, e.g. year to 2017.0, dates to 1469491200000000000 + print (s903_df_next) + s903_df_next.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + index=False) episodes_fix( - r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\903\SSDA903_episodes.csv", - r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\903" + r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv", + r"liiatools/datasets/s903/lds_ssda903_episodes_fix" ) + +# poetry run python liiatools/datasets/s903/s903_main_functions.py From 11f77d9f49ef34e79d9cecd97abc4c9b4593f524 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:57:20 +0000 Subject: [PATCH 03/25] Latest changes --- ...DA903_episodes_for_testing_fixes_INPUT.csv | 37 ++++++++++--------- .../s903/lds_ssda903_episodes_fix/process.py | 25 +++++++++++++ .../datasets/s903/s903_main_functions.py | 10 ++--- 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv index c402e4a2..37cf95a1 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv @@ -1,19 +1,20 @@ CHILD,DECOM,RNE,LS,CIN,PLACE,PLACE_PROVIDER,DEC,REC,REASON_PLACE_CHANGE,HOME_POST,PL_POST,URN,LA,YEAR -RULE1_BAD,2019/06/21,P,C2,N1,H5,PR4,2019/07/23,E6,,R1 1,Z28 8,,Barking and Dagenham,2020 -RULE1_BAD,2017/08/20,T,C2,N1,U4,PR4,2019/06/21,X1,CARPL,R1 1,Z8 5,4743233,Barking and Dagenham,2020 -RULE1_BAD,2016/01/13,P,C2,N1,U6,PR4,,,,R1 1,Z8 5,4743233,Barking and Dagenham,2018 -RULE1A_BAD,2017/04/20,S,V2,N1,H5,PR4,2017/06/13,E8,,R1A 1,Z28 2,,Barking and Dagenham,2018 -RULE1A_BAD,2016/01/22,S,V2,N1,P2,PR4,2016/10/24,X1,CARPL,R1A 1,Z25 1,,Barking and Dagenham,2017 -RULE1A_BAD,2016/10/24,P,V2,N1,P2,PR5,,,,R1A 1,Z8 5,,Barking and Dagenham,2017 -RULE2_BAD,2019/05/10,S,J1,N6,R5,PR3,,,,R2 1,Z32 4,,Barking and Dagenham,2020 -RULE3_BAD,2016/09/23,S,V2,N1,U6,PR1,2017/05/30,E2,,R3 1,Z12 3,4270000,Barking and Dagenham,2018 -RULE3_BAD,2016/09/27,S,V2,N1,U6,PR1,,,,R3 1,Z12 3,4270000,Barking and Dagenham,2017 -RULE3A_BAD,2018/01/25,L,C1,N1,U3,PR1,2018/05/16,E45,PLACE,R3A 1,Z11 4,4740000,Barking and Dagenham,2019 -RULE3A_BAD,2017/11/25,S,V2,N1,U3,PR1,2018/01/25,X1,,R3A 1,Z11 4,4740000,Barking and Dagenham,2018 -RULE3A_BAD,2018/03/22,T,C1,N1,U1,PR1,,,,,,4740000,Barking and Dagenham,2018 -RULE4_BAD,2019/08/19,P,C1,N1,P1,PR0,2019/10/01,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 -RULE4_BAD,2018/12/20,P,C1,N1,U6,PR1,2019/08/19,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 -RULE4_BAD,2018/11/23,S,C1,N1,U6,PR1,2018/12/24,X1,OTHER,R4 3,Z22 9,4890000,Barking and Dagenham,2019 -RULE4_BAD,2018/12/24,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 
-NORULE_BAD,2016/07/26,T,C2,N1,U4,PR4,2021/12/12,E17,,ABC 1,DEF 2,4610000,Barking and Dagenham,2022 -NORULE_BAD,2015/06/19,P,C2,N1,U6,PR4,2016/07/26,X1,CARPL,ABC 1,FGH 3,4610000,Barking and Dagenham,2017 +RULE1_BAD,21/06/2019,P,C2,N1,H5,PR4,23/07/2019,E6,,R1 1,Z28 8,,Barking and Dagenham,2020 +RULE1_BAD,20/08/2017,T,C2,N1,U4,PR4,21/06/2019,X1,CARPL,R1 1,Z8 5,4743233,Barking and Dagenham,2020 +RULE1_BAD,13/01/2016,P,C2,N1,U6,PR4,,,,R1 1,Z8 5,4743233,Barking and Dagenham,2018 +RULE1A_BAD,20/04/2017,S,V2,N1,H5,PR4,13/06/2017,E8,,R1A 1,Z28 2,,Barking and Dagenham,2018 +RULE1A_BAD,22/01/2016,S,V2,N1,P2,PR4,24/10/2016,X1,CARPL,R1A 1,Z25 1,,Barking and Dagenham,2017 +RULE1A_BAD,24/10/2016,P,V2,N1,P2,PR5,,,,R1A 1,Z8 5,,Barking and Dagenham,2017 +RULE2_BAD,10/05/2019,S,J1,N6,R5,PR3,,,,R2 1,Z32 4,,Barking and Dagenham,2020 +RULE3_BAD,23/09/2016,S,V2,N1,U6,PR1,30/05/2017,E2,,R3 1,Z12 3,4270000,Barking and Dagenham,2018 +RULE3_BAD,27/09/2016,S,V2,N1,U6,PR1,,,,R3 1,Z12 3,4270000,Barking and Dagenham,2017 +RULE3A_BAD,25/01/2018,L,C1,N1,U3,PR1,16/05/2018,E45,PLACE,R3A 1,Z11 4,4740000,Barking and Dagenham,2019 +RULE3A_BAD,25/11/2017,S,V2,N1,U3,PR1,25/01/2018,X1,,R3A 1,Z11 4,4740000,Barking and Dagenham,2018 +RULE3A_BAD,22/03/2018,T,C1,N1,U1,PR1,,,,,,4740000,Barking and Dagenham,2018 +RULE4_BAD,19/08/2019,P,C1,N1,P1,PR0,01/10/2019,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 +RULE4_BAD,20/12/2018,P,C1,N1,U6,PR1,19/08/2019,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 +RULE4_BAD,23/11/2018,S,C1,N1,U6,PR1,24/12/2018,X1,OTHER,R4 3,Z22 9,4890000,Barking and Dagenham,2019 +RULE4_BAD,24/12/2018,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 +NORULE_BAD,26/07/2016,T,C2,N1,U4,PR4,12/12/2021,E17,,ABC 1,DEF 2,4610000,Barking and Dagenham,2022 +NORULE_BAD,19/06/2015,P,C2,N1,U6,PR4,26/07/2016,X1,CARPL,ABC 1,FGH 3,4610000,Barking and Dagenham,2017 +NORULE_NEW,01/02/2022,T,C2,N1,U4,PR4,10/07/2022,E17,,ABC 3,DEF 4,4310000,Newham,2023 diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 1f445831..18be5107 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -26,6 +26,7 @@ def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> :param columns: List of columns containing required data from previous/next episodes :return: Dataframe with columns showing previous and next episodes """ + print("create_previous_and_next_episode()...") for column in columns: dataframe[column + "_previous"] = np.where( dataframe["CHILD"] == dataframe["CHILD"].shift(1), @@ -38,3 +39,27 @@ def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> None, ) return dataframe + + +def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add column to containing latest submission year for each LA + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with column showing latest submission year for each LA + """ + print("add_latest_year_for_la()") + dataframe['YEAR_latest'] = dataframe.groupby('LA')['YEAR'].transform('max') + dataframe["Episode_source"] = "Original" + return dataframe + + +def identify_not_latest_open_episode(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add column to identify rows with open episodes that were not submitted in the latest file year + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with column 
showing true if episode is open but is not from the latest file year + """ + print("identify_not_latest_open_episode()...TODO") + return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 27be02b3..fc2dab15 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -221,15 +221,15 @@ def episodes_fix(input, output): column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) if table_name == "Episodes": - dates = config["dates"] - s903_df = common_process.convert_datetimes(s903_df, dates, table_name) + #dates = config["dates"] + #s903_df = common_process.convert_datetimes(s903_df, dates, table_name) s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) - print (s903_df.dtypes) s903_df_next = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) - + s903_df_next = episodes_process.add_latest_year_and_source_for_la(s903_df_next) + #s903_df_next = episodes_process.identify_not_latest_open_episode(s903_df_next) + # Following code used to test outputs during development s903_df_next = s903_df_next.sort_values(["CHILD", "DECOM"], ignore_index=True) - print (s903_df_next.dtypes) # Issue: np.where is converting types, e.g. year to 2017.0, dates to 1469491200000000000 print (s903_df_next) s903_df_next.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", index=False) From 0d5a1cf809e111863eea835e5967c0676eee5a7b Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:29:03 +0000 Subject: [PATCH 04/25] Work in progress --- ...DA903_episodes_for_testing_fixes_INPUT.csv | 40 +++++++------- .../s903/lds_ssda903_episodes_fix/process.py | 54 ++++++++++++++++--- .../datasets/s903/s903_main_functions.py | 3 +- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv index 37cf95a1..a3b579f1 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv @@ -1,20 +1,22 @@ CHILD,DECOM,RNE,LS,CIN,PLACE,PLACE_PROVIDER,DEC,REC,REASON_PLACE_CHANGE,HOME_POST,PL_POST,URN,LA,YEAR -RULE1_BAD,21/06/2019,P,C2,N1,H5,PR4,23/07/2019,E6,,R1 1,Z28 8,,Barking and Dagenham,2020 -RULE1_BAD,20/08/2017,T,C2,N1,U4,PR4,21/06/2019,X1,CARPL,R1 1,Z8 5,4743233,Barking and Dagenham,2020 -RULE1_BAD,13/01/2016,P,C2,N1,U6,PR4,,,,R1 1,Z8 5,4743233,Barking and Dagenham,2018 -RULE1A_BAD,20/04/2017,S,V2,N1,H5,PR4,13/06/2017,E8,,R1A 1,Z28 2,,Barking and Dagenham,2018 -RULE1A_BAD,22/01/2016,S,V2,N1,P2,PR4,24/10/2016,X1,CARPL,R1A 1,Z25 1,,Barking and Dagenham,2017 -RULE1A_BAD,24/10/2016,P,V2,N1,P2,PR5,,,,R1A 1,Z8 5,,Barking and Dagenham,2017 -RULE2_BAD,10/05/2019,S,J1,N6,R5,PR3,,,,R2 1,Z32 4,,Barking and Dagenham,2020 -RULE3_BAD,23/09/2016,S,V2,N1,U6,PR1,30/05/2017,E2,,R3 1,Z12 3,4270000,Barking and Dagenham,2018 -RULE3_BAD,27/09/2016,S,V2,N1,U6,PR1,,,,R3 1,Z12 3,4270000,Barking and Dagenham,2017 -RULE3A_BAD,25/01/2018,L,C1,N1,U3,PR1,16/05/2018,E45,PLACE,R3A 1,Z11 4,4740000,Barking and Dagenham,2019 
-RULE3A_BAD,25/11/2017,S,V2,N1,U3,PR1,25/01/2018,X1,,R3A 1,Z11 4,4740000,Barking and Dagenham,2018 -RULE3A_BAD,22/03/2018,T,C1,N1,U1,PR1,,,,,,4740000,Barking and Dagenham,2018 -RULE4_BAD,19/08/2019,P,C1,N1,P1,PR0,01/10/2019,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 -RULE4_BAD,20/12/2018,P,C1,N1,U6,PR1,19/08/2019,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 -RULE4_BAD,23/11/2018,S,C1,N1,U6,PR1,24/12/2018,X1,OTHER,R4 3,Z22 9,4890000,Barking and Dagenham,2019 -RULE4_BAD,24/12/2018,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 -NORULE_BAD,26/07/2016,T,C2,N1,U4,PR4,12/12/2021,E17,,ABC 1,DEF 2,4610000,Barking and Dagenham,2022 -NORULE_BAD,19/06/2015,P,C2,N1,U6,PR4,26/07/2016,X1,CARPL,ABC 1,FGH 3,4610000,Barking and Dagenham,2017 -NORULE_NEW,01/02/2022,T,C2,N1,U4,PR4,10/07/2022,E17,,ABC 3,DEF 4,4310000,Newham,2023 +NORULE_BAD,2015-06-19,P,C2,N1,U6,PR4,2016-07-26,X1,CARPL,ABC 1,FGH 3,4610000,Barking and Dagenham,2017 +NORULE_BAD,2016-07-26,T,C2,N1,U4,PR4,2021-12-12,E17,,ABC 1,DEF 2,4610000,Barking and Dagenham,2022 +NORULE_NEW,2022-02-01,T,C2,N1,U4,PR4,2022-07-10,E17,,ABC 3,DEF 4,4310000,Newham,2023 +NORULE_NEW,2022-10-23,S,V2,N1,U4,PR4,,,,ABC 3,DEF 4,4310000,Newham,2023 +NORULE_SUT,2017-12-15,S,V2,N1,U1,PR1,,,,FGH 2,JKL 1,4870000,Sutton,2022 +RULE1_BAD,2016-01-13,P,C2,N1,U6,PR4,,,,R1 1,Z8 5,4743233,Barking and Dagenham,2018 +RULE1_BAD,2017-08-20,T,C2,N1,U4,PR4,2019-06-21,X1,CARPL,R1 1,Z8 5,4743233,Barking and Dagenham,2020 +RULE1_BAD,2019-06-21,P,C2,N1,H5,PR4,2019-07-23,E6,,R1 1,Z28 8,,Barking and Dagenham,2020 +RULE1A_BAD,2016-01-22,S,V2,N1,P2,PR4,2016-10-24,X1,CARPL,R1A 1,Z25 1,,Barking and Dagenham,2017 +RULE1A_BAD,2016-10-24,P,V2,N1,P2,PR5,,,,R1A 1,Z8 5,,Barking and Dagenham,2017 +RULE1A_BAD,2017-04-20,S,V2,N1,H5,PR4,2017-06-13,E8,,R1A 1,Z28 2,,Barking and Dagenham,2018 +RULE2_BAD,2019-05-10,S,J1,N6,R5,PR3,,,,R2 1,Z32 4,,Barking and Dagenham,2020 +RULE3_BAD,2016-09-23,S,V2,N1,U6,PR1,2017-05-30,E2,,R3 1,Z12 3,4270000,Barking and Dagenham,2018 +RULE3_BAD,2016-09-27,S,V2,N1,U6,PR1,,,,R3 1,Z12 3,4270000,Barking and Dagenham,2017 +RULE3A_BAD,2017-11-25,S,V2,N1,U3,PR1,2018-01-25,X1,,R3A 1,Z11 4,4740000,Barking and Dagenham,2018 +RULE3A_BAD,2018-01-25,L,C1,N1,U3,PR1,2018-05-16,E45,PLACE,R3A 1,Z11 4,4740000,Barking and Dagenham,2019 +RULE3A_BAD,2018-03-22,T,C1,N1,U1,PR1,,,,,,4740000,Barking and Dagenham,2018 +RULE4_BAD,2018-11-23,S,C1,N1,U6,PR1,2018-12-24,X1,OTHER,R4 3,Z22 9,4890000,Barking and Dagenham,2019 +RULE4_BAD,2018-12-20,P,C1,N1,U6,PR1,2019-08-19,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 +RULE4_BAD,2018-12-24,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 +RULE4_BAD,2019-08-19,P,C1,N1,P1,PR0,2019-10-01,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 18be5107..277aedd8 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -43,23 +43,63 @@ def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: """ - Add column to containing latest submission year for each LA + Add column to containing latest submission year and source for each LA :param dataframe: Dataframe with SSDA903 Episodes data - :return: Dataframe with column showing latest submission year for each LA + :return: Dataframe with column showing latest submission year 
for each LA and column showing episode source """ - print("add_latest_year_for_la()") + print("add_latest_year_and_source_for_la()...") dataframe['YEAR_latest'] = dataframe.groupby('LA')['YEAR'].transform('max') dataframe["Episode_source"] = "Original" return dataframe -def identify_not_latest_open_episode(dataframe: pd.DataFrame) -> pd.DataFrame: +def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: """ - Add column to identify rows with open episodes that were not submitted in the latest file year + Add columns to identify rows with open episodes that meet certain criteria :param dataframe: Dataframe with SSDA903 Episodes data - :return: Dataframe with column showing true if episode is open but is not from the latest file year + :return: Dataframe with columns showing true if certain conditions are met """ - print("identify_not_latest_open_episode()...TODO") + print("add_stage1_rule_identifier_columns...") + dataframe = dataframe.assign(Has_open_episode_error=lambda x: (x.DEC.isnull() ) & (x.YEAR != x.YEAR_latest) ) + dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() + dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() + dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda x: (x.Has_next_episode) & (x.RNE_next == "S") ) + dataframe = dataframe.assign(Next_episode_is_duplicate=lambda x: (x.DEC).isnull() & + (x.Has_next_episode) & + (x.DECOM_next != x.DECOM) & + (x.RNE_next == x.RNE) & + (x.LS_next == x.LS) & + (x.PLACE_next == x.PLACE) & + (x.PLACE_PROVIDER_next == x.PLACE_PROVIDER) & + (x.PL_POST_next == x.PL_POST) & + (x.URN_next == x.URN) + ) + dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda x: (x.DEC).isnull() & + (x.Has_previous_episode) & + (x.DECOM_previous != x.DECOM) & + (x.RNE_previous == x.RNE) & + (x.LS_previous == x.LS) & + (x.PLACE_previous == x.PLACE) & + (x.PLACE_PROVIDER_previous == x.PLACE_PROVIDER) & + (x.PL_POST_previous == x.PL_POST) & + (x.URN_previous == x.URN) + ) + dataframe = dataframe.assign(Previous_episode_submitted_later=lambda x: (x.DEC).isnull() & + (x.Has_previous_episode) & + (x.YEAR_previous > x.YEAR) + ) + return dataframe + + +def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add column to identify which stage 1 rule should be applied + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with column showing stage 1 rule to be applied + """ + print("identify_stage1_rule_to_apply...") + # To do based on criteria identified return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index fc2dab15..b41e5a1f 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -226,7 +226,8 @@ def episodes_fix(input, output): s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) s903_df_next = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) s903_df_next = episodes_process.add_latest_year_and_source_for_la(s903_df_next) - #s903_df_next = episodes_process.identify_not_latest_open_episode(s903_df_next) + s903_df_next = episodes_process.add_stage1_rule_identifier_columns(s903_df_next) + s903_df_next = episodes_process.identify_stage1_rule_to_apply(s903_df_next) # Following code used to test outputs during development s903_df_next = s903_df_next.sort_values(["CHILD", "DECOM"], ignore_index=True) From 
246aab8d7d7b57179be07183a0b70f03327b7c10 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:55:58 +0000 Subject: [PATCH 05/25] Work in progress --- .../s903/lds_ssda903_episodes_fix/process.py | 24 +++++++++++++++++++ .../datasets/s903/s903_main_functions.py | 20 ++++++++-------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 277aedd8..a21c1e69 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -17,6 +17,15 @@ "YEAR", ] +__DATES = [ + "DECOM", + "DEC", + "DECOM_previous", + "DEC_previous", + "DECOM_next", + "DEC_next", +] + def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame: """ @@ -41,6 +50,21 @@ def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> return dataframe +def format_datetime(dataframe: pd.DataFrame, date_columns: list) -> pd.DataFrame: + """ + Format date columns to datetime type + + :param dataframe: Dataframe with SSDA903 Episodes data + :param columns: List of columns containing dates + :return: Dataframe with date columns showing as datetime data type + """ + print("format_datetime()...") + + # dataframe["DECOM"].apply(pd.to_datetime, format='%Y-%m-%d', errors='raise') + dataframe[date_columns] = dataframe[date_columns].apply(pd.to_datetime, format="%Y-%m-%d", errors="raise") + return dataframe + + def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: """ Add column to containing latest submission year and source for each LA diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index b41e5a1f..5a7a3faf 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -220,19 +220,19 @@ def episodes_fix(input, output): s903_df = common_process.read_file(input) column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) + + # Process stage 1 fixes for Episodes table if table_name == "Episodes": - #dates = config["dates"] - #s903_df = common_process.convert_datetimes(s903_df, dates, table_name) s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_next = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) - s903_df_next = episodes_process.add_latest_year_and_source_for_la(s903_df_next) - s903_df_next = episodes_process.add_stage1_rule_identifier_columns(s903_df_next) - s903_df_next = episodes_process.identify_stage1_rule_to_apply(s903_df_next) - + s903_df_stage1 = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) + s903_df_stage1 = episodes_process.format_datetime(s903_df_stage1, episodes_process.__DATES) + #print(s903_df_stage1[episodes_process.__DATES].dtypes) # Hooray! 
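# Review sketch (illustrative, not part of the patch): format_datetime() relies on
# DataFrame.apply forwarding keyword arguments to pd.to_datetime, so an explicit
# format plus errors="raise" both parses and validates the date columns in one pass:
import pandas as pd

check = pd.DataFrame({"DECOM": ["2016-01-13", None], "DEC": ["2019-07-23", "2019-06-21"]})
check[["DECOM", "DEC"]] = check[["DECOM", "DEC"]].apply(
    pd.to_datetime, format="%Y-%m-%d", errors="raise"
)
print(check.dtypes)  # both columns now report: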
datetime64[ns] + s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la(s903_df_stage1) + s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns(s903_df_stage1) + s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1) # Following code used to test outputs during development - s903_df_next = s903_df_next.sort_values(["CHILD", "DECOM"], ignore_index=True) - print (s903_df_next) - s903_df_next.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + s903_df_stage1 = s903_df_stage1.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_stage1.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", index=False) From eb96556f50b31778c7ef80a4550ec52289b83157 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 12 Feb 2024 16:03:26 +0000 Subject: [PATCH 06/25] WIP - sort out rule to apply --- .../s903/lds_ssda903_episodes_fix/process.py | 71 ++++++++++++------- .../datasets/s903/s903_main_functions.py | 1 + 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index a21c1e69..06690c7b 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -78,6 +78,43 @@ def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe +def _is_next_episode_duplicate(row): + return (row.DEC.isnull() & + row.Has_next_episode & + ( (row.DECOM_next != row.DECOM) | (row.DECOM_next.isnull() & row.DECOM.isnull()) ) & + ( (row.RNE_next == row.RNE) | (row.RNE_next.isnull() & row.RNE.isnull()) ) & + ( (row.LS_next == row.LS) | (row.LS_next.isnull() & row.LS.isnull()) ) & + ( (row.PLACE_next == row.PLACE) | (row.PLACE_next.isnull() | row.PLACE.isnull()) ) & + ( (row.PLACE_PROVIDER_next == row.PLACE_PROVIDER) | (row.PLACE_PROVIDER_next.isnull() | row.PLACE_PROVIDER.isnull()) ) & + ( (row.PL_POST_next == row.PL_POST) | (row.PL_POST_next.isnull() | row.PL_POST.isnull()) ) & + ( (row.URN_next == row.URN) | (row.URN_next.isnull() | row.URN.isnull()) ) + ) + + +def _is_previous_episode_duplicate(row): + return (row.DEC.isnull() & + row.Has_previous_episode & + ( (row.DECOM_previous != row.DECOM) | (row.DECOM_previous.isnull() & row.DECOM.isnull()) ) & + ( (row.RNE_previous == row.RNE) | (row.RNE_previous.isnull() & row.RNE.isnull()) ) & + ( (row.LS_previous == row.LS) | (row.LS_previous.isnull() & row.LS.isnull()) ) & + ( (row.PLACE_previous == row.PLACE) | (row.PLACE_previous.isnull() | row.PLACE.isnull()) ) & + ( (row.PLACE_PROVIDER_previous == row.PLACE_PROVIDER) | (row.PLACE_PROVIDER_previous.isnull() | row.PLACE_PROVIDER.isnull()) ) & + ( (row.PL_POST_previous == row.PL_POST) | (row.PL_POST_previous.isnull() | row.PL_POST.isnull()) ) & + ( (row.URN_previous == row.URN) | (row.URN_previous.isnull() | row.URN.isnull()) ) + ) + + +def _is_previous_episode_submitted_later(row): + return (row.DEC.isnull() & + (row.Has_previous_episode) & + (row.YEAR_previous > row.YEAR) + ) + +def _rule_to_apply(row): + if row["Has_open_episode_error"]: + return "Some rule tbd" + + def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: """ Add columns to identify rows with open episodes that meet certain criteria @@ -86,34 +123,13 @@ def 
add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: :return: Dataframe with columns showing true if certain conditions are met """ print("add_stage1_rule_identifier_columns...") - dataframe = dataframe.assign(Has_open_episode_error=lambda x: (x.DEC.isnull() ) & (x.YEAR != x.YEAR_latest) ) + dataframe = dataframe.assign(Has_open_episode_error=lambda row: (row.DEC.isnull() ) & (row.YEAR != row.YEAR_latest) ) dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() - dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda x: (x.Has_next_episode) & (x.RNE_next == "S") ) - dataframe = dataframe.assign(Next_episode_is_duplicate=lambda x: (x.DEC).isnull() & - (x.Has_next_episode) & - (x.DECOM_next != x.DECOM) & - (x.RNE_next == x.RNE) & - (x.LS_next == x.LS) & - (x.PLACE_next == x.PLACE) & - (x.PLACE_PROVIDER_next == x.PLACE_PROVIDER) & - (x.PL_POST_next == x.PL_POST) & - (x.URN_next == x.URN) - ) - dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda x: (x.DEC).isnull() & - (x.Has_previous_episode) & - (x.DECOM_previous != x.DECOM) & - (x.RNE_previous == x.RNE) & - (x.LS_previous == x.LS) & - (x.PLACE_previous == x.PLACE) & - (x.PLACE_PROVIDER_previous == x.PLACE_PROVIDER) & - (x.PL_POST_previous == x.PL_POST) & - (x.URN_previous == x.URN) - ) - dataframe = dataframe.assign(Previous_episode_submitted_later=lambda x: (x.DEC).isnull() & - (x.Has_previous_episode) & - (x.YEAR_previous > x.YEAR) - ) + dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) & (row.RNE_next == "S") ) + dataframe = dataframe.assign(Next_episode_is_duplicate=lambda row: _is_next_episode_duplicate(row)) + dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda row: _is_previous_episode_duplicate(row)) + dataframe = dataframe.assign(Previous_episode_submitted_later=lambda row: _is_previous_episode_submitted_later(row)) return dataframe @@ -125,5 +141,6 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: :return: Dataframe with column showing stage 1 rule to be applied """ print("identify_stage1_rule_to_apply...") - # To do based on criteria identified + #rule1_condition = (dataframe["Has_open_episode_error"]) + dataframe["Rule_to_apply"] = dataframe.apply(_rule_to_apply, axis=1) return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 5a7a3faf..8fc3583a 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -230,6 +230,7 @@ def episodes_fix(input, output): s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la(s903_df_stage1) s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns(s903_df_stage1) s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1) + print(s903_df_stage1) # Following code used to test outputs during development s903_df_stage1 = s903_df_stage1.sort_values(["CHILD", "DECOM"], ignore_index=True) s903_df_stage1.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", From 43baf30d127d29b6dd2e6b3d38aad96aae6b7075 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:50:18 +0000 Subject: [PATCH 07/25] WIP - correctly identifies which stage 1 rule to apply --- 
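A hedged, vectorised sketch of the same first-match-wins rule precedence this commit implements row by row. It assumes the identifier columns created by add_stage1_rule_identifier_columns are already present; np.select evaluates the conditions in order, exactly like the if/elif chain below:

import numpy as np

def stage1_rules_vectorised(df):
    conditions = [
        df["Next_episode_is_duplicate"] | df["Previous_episode_is_duplicate"],
        df["Previous_episode_submitted_later"],
        ~df["Has_next_episode"],
        df["Has_next_episode_with_RNE_equals_S"],
    ]
    choices = ["RULE_3", "RULE_3A", "RULE_2", "RULE_1A"]
    rules = np.select(conditions, choices, default="RULE_1")
    # Only rows flagged with an open-episode error get a rule; the rest stay unset
    return np.where(df["Has_open_episode_error"], rules, None)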
.../s903/lds_ssda903_episodes_fix/process.py | 49 ++++++++++++++++---
 .../datasets/s903/s903_main_functions.py      | 20 +++++---
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py
index 06690c7b..d398f523 100644
--- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py
+++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py
@@ -110,9 +110,17 @@ def _is_previous_episode_submitted_later(row):
             (row.YEAR_previous > row.YEAR)
             )
 
-def _rule_to_apply(row):
+def _stage1_rule_to_apply(row):
     if row["Has_open_episode_error"]:
-        return "Some rule tbd"
+        if row["Next_episode_is_duplicate"] | row["Previous_episode_is_duplicate"]:
+            return "RULE_3" # Duplicate
+        if row["Previous_episode_submitted_later"]:
+            return "RULE_3A" # Episode replaced in later submission
+        if not row["Has_next_episode"]:
+            return "RULE_2" # Ceases LAC
+        if row["Has_next_episode_with_RNE_equals_S"]:
+            return "RULE_1A" # Ceases LAC, but re-enters care later
+        return "RULE_1" # Remains LAC, episode changes
 
 
 def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -127,20 +135,45 @@ def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame
     dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull()
     dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull()
     dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) & (row.RNE_next == "S") )
-    dataframe = dataframe.assign(Next_episode_is_duplicate=lambda row: _is_next_episode_duplicate(row))
+    #dataframe = dataframe.assign(Next_episode_is_duplicate=lambda row: _is_next_episode_duplicate(row))
+    dataframe = dataframe.assign(Next_episode_is_duplicate=_is_next_episode_duplicate)
-    dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda row: _is_previous_episode_duplicate(row))
+    #dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda row: _is_previous_episode_duplicate(row))
+    dataframe = dataframe.assign(Previous_episode_is_duplicate=_is_previous_episode_duplicate)
-    dataframe = dataframe.assign(Previous_episode_submitted_later=lambda row: _is_previous_episode_submitted_later(row))
+    #dataframe = dataframe.assign(Previous_episode_submitted_later=lambda row: _is_previous_episode_submitted_later(row))
+    dataframe = dataframe.assign(Previous_episode_submitted_later=_is_previous_episode_submitted_later)
     return dataframe
 
 
 def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame:
     """
-    Add column to identify which stage 1 rule should be applied
+    Add column to identify which stage 1 rule should be applied:
+    RULE_1 : Child remains LAC but episode changes
+    RULE_1A: Child ceases LAC but
re-enters care later + RULE_2 : Child ceases LAC + RULE_3 : Episode is a duplicate - delete + RULE_3A: Episode replaced in later submission - delete + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with stage 1 rules applied + """ + print("apply_stage1_rules...TODO") + episodes_to_delete = (dataframe["Rule_to_apply"] == 'RULE_3') | (dataframe["Rule_to_apply"] == 'RULE_3A') + dataframe = dataframe.drop(dataframe[episodes_to_delete].index) return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 8fc3583a..e5f84f64 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -221,22 +221,30 @@ def episodes_fix(input, output): column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) - # Process stage 1 fixes for Episodes table + # Process stage 1 rule fixes for Episodes table if table_name == "Episodes": + # Add columns to dataframe to identify which rules should be applied s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) s903_df_stage1 = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) s903_df_stage1 = episodes_process.format_datetime(s903_df_stage1, episodes_process.__DATES) - #print(s903_df_stage1[episodes_process.__DATES].dtypes) # Hooray! datetime64[ns] s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la(s903_df_stage1) s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns(s903_df_stage1) s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1) - print(s903_df_stage1) + + # Apply the stage 1 rules + s903_df_stage1_applied = episodes_process.apply_stage1_rules(s903_df_stage1) + # Following code used to test outputs during development - s903_df_stage1 = s903_df_stage1.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_stage1.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", - index=False) + print("Dataframe with rules identified:") + print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]]) + print("Dataframe with stage 1 rules applied (Incomplete - more rules to apply):") + print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]]) + s903_df_stage1_applied = s903_df_stage1_applied.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_stage1_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + index=False) +# Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) episodes_fix( r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv", r"liiatools/datasets/s903/lds_ssda903_episodes_fix" From 1632b251583e5acf4074ba4ff1af51769f79899c Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 14 Feb 2024 15:05:13 +0000 Subject: [PATCH 08/25] WIP --- liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index d398f523..18de2e5c 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py 
+++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -174,6 +174,9 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: :return: Dataframe with stage 1 rules applied """ print("apply_stage1_rules...TODO") - episodes_to_delete = (dataframe["Rule_to_apply"] == 'RULE_3') | (dataframe["Rule_to_apply"] == 'RULE_3A') + episodes_to_delete = dataframe["Rule_to_apply"].isin( ['RULE_3', 'RULE_3A']) dataframe = dataframe.drop(dataframe[episodes_to_delete].index) + + # write code here for rules 1, 1A, 2 + return dataframe \ No newline at end of file From 892f6aab4a5816211df657d7e675f3d0547808e9 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:59:09 +0000 Subject: [PATCH 09/25] WIP - stage 1 rules working --- .../s903/lds_ssda903_episodes_fix/process.py | 127 +++++++++++++++++- .../datasets/s903/s903_main_functions.py | 27 ++-- 2 files changed, 143 insertions(+), 11 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 18de2e5c..b8c29c51 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from datetime import datetime, timedelta __COLUMNS = [ "DECOM", @@ -26,6 +27,25 @@ "DEC_next", ] +__COLUMNS_TO_KEEP = [ + "CHILD", + "LA", + "DECOM", + "RNE", + "LS", + "CIN", + "PLACE", + "PLACE_PROVIDER", + "DEC", + "REC", + "REASON_PLACE_CHANGE", + "HOME_POST", + "PL_POST", + "URN", + "YEAR", + "YEAR_latest", + "Episode_source", +] def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame: """ @@ -161,6 +181,64 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe +def _update_dec(row): + """ + Determine updated DEC value. Defaults to input DEC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated DEC date + """ + end_of_year = datetime(row["YEAR"], 3, 31) + if row["Has_open_episode_error"]: + if row["Rule_to_apply"] == "RULE_1": + return row["DECOM_next"] + if row["Rule_to_apply"] == "RULE_1A": + day_before_next_decom = row["DECOM_next"] - timedelta(days = 1) + return min(end_of_year, day_before_next_decom) # get earliest date + if row["Rule_to_apply"] == "RULE_2": + return end_of_year + return row["DEC"] + + +def _update_rec(row): + """ + Determine updated REC value. Defaults to input REC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated REC value + """ + episode_ends_liia_fix = "E99" + episode_continues = "X1" + if row["Has_open_episode_error"]: + if row["Rule_to_apply"] == "RULE_1": + return episode_continues + if row["Rule_to_apply"] in ("RULE_1A", "RULE_2"): + return episode_ends_liia_fix + return row["REC"] + + +def _update_reason_place_change(row): + """ + Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated REASON_PLACE_CHANGE value + """ + reason_liia_fix = "LIIAF" + if row["Has_open_episode_error"]: + if (row["Rule_to_apply"] == "RULE_1") & (row["RNE_next"] in ("P", "B", "T", "U")): + return reason_liia_fix + return row["REASON_PLACE_CHANGE"] + + +def _update_episode_source(row): + """ + Determine updated Episode_source value. 
Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated Episode_source value + """ + if row["Has_open_episode_error"]: + return row["Rule_to_apply"] + return row["Episode_source"] + + def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: """ Apply stage 1 rules: @@ -173,10 +251,53 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with stage 1 rules applied """ - print("apply_stage1_rules...TODO") + print("apply_stage1_rules") + # Apply rules 3, 3A to delete rows episodes_to_delete = dataframe["Rule_to_apply"].isin( ['RULE_3', 'RULE_3A']) dataframe = dataframe.drop(dataframe[episodes_to_delete].index) - # write code here for rules 1, 1A, 2 - + # Apply rules 1, 1A, 2 + dataframe["DEC"] = dataframe.apply(_update_dec, axis=1) + dataframe["REC"] = dataframe.apply(_update_rec, axis=1) + dataframe["REASON_PLACE_CHANGE"] = dataframe.apply(_update_reason_place_change, axis=1) + dataframe["Episode_source"] = dataframe.apply(_update_episode_source, axis=1) + + return dataframe + + +def add_stage2_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add columns to identify rows which overlap or underlap surrounding episodes + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with columns showing true if certain conditions are met + """ + print("add_stage2_rule_identifier_columns...TODO") + + return dataframe + + +def identify_stage2_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add column to identify which stage 2 rule should be applied: + RULE_4: Overlaps with next episode + RULE_5: End reason is "X1" - episode continues - but there is gap before next episode + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with column showing stage 2 rule to be applied + """ + print("identify_stage2_rule_to_apply...TODO") + return dataframe + + +def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Apply stage 2 rules: + RULE_4: Overlaps with next episode + RULE_5: End reason is "X1" - episode continues - but there is gap before next episode + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with stage 2 rules applied + """ + print("apply_stage2_rules...TODO") return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index e5f84f64..056d7d04 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -223,7 +223,7 @@ def episodes_fix(input, output): # Process stage 1 rule fixes for Episodes table if table_name == "Episodes": - # Add columns to dataframe to identify which rules should be applied + # Add columns to dataframe to identify which rules should be applied at stage 1 s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) s903_df_stage1 = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) s903_df_stage1 = episodes_process.format_datetime(s903_df_stage1, episodes_process.__DATES) @@ -233,15 +233,26 @@ def episodes_fix(input, output): # Apply the stage 1 rules s903_df_stage1_applied = episodes_process.apply_stage1_rules(s903_df_stage1) - + + # Add columns to dataframe to identify which rules should be applied at stage 2 TODO + s903_df_stage2 = s903_df_stage1_applied[episodes_process.__COLUMNS_TO_KEEP] + 
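# Review sketch (values illustrative): a quick standalone check of the RULE_1A
# date logic in _update_dec above - DEC becomes the earlier of 31 March of the
# episode's YEAR and the day before the next episode's DECOM.
from datetime import datetime, timedelta

end_of_year = datetime(2018, 3, 31)
day_before_next_decom = datetime(2018, 2, 10) - timedelta(days=1)
assert min(end_of_year, day_before_next_decom) == datetime(2018, 2, 9)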
s903_df_stage2 = episodes_process.create_previous_and_next_episode(s903_df_stage2, episodes_process.__COLUMNS) + s903_df_stage2 = episodes_process.format_datetime(s903_df_stage2, episodes_process.__DATES) + s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns(s903_df_stage2) + s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2) + + # Apply the stage 2 rules TODO + s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2) + # Following code used to test outputs during development print("Dataframe with rules identified:") - print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]]) - print("Dataframe with stage 1 rules applied (Incomplete - more rules to apply):") - print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]]) - - s903_df_stage1_applied = s903_df_stage1_applied.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_stage1_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Has_open_episode_error", "Rule_to_apply"]]) + print("Dataframe with stage 1 rules applied:") + print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "Has_open_episode_error", "Rule_to_apply"]]) + print("Dataframe with stage 2 rules applied...to be developed:") + print(s903_df_stage2_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source"]]) + s903_df_stage2_applied = s903_df_stage2_applied.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_stage2_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", index=False) # Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) From 88eb38515bdf0f9f17fb379af8602f56debd5e15 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:27:24 +0000 Subject: [PATCH 10/25] WIP - applied stage 2 rules --- ...DA903_episodes_for_testing_fixes_INPUT.csv | 2 + .../s903/lds_ssda903_episodes_fix/process.py | 80 +++++++++++++++---- .../datasets/s903/s903_main_functions.py | 14 ++-- 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv index a3b579f1..8e2f469c 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv @@ -20,3 +20,5 @@ RULE4_BAD,2018-11-23,S,C1,N1,U6,PR1,2018-12-24,X1,OTHER,R4 3,Z22 9,4890000,Barki RULE4_BAD,2018-12-20,P,C1,N1,U6,PR1,2019-08-19,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 RULE4_BAD,2018-12-24,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2019 RULE4_BAD,2019-08-19,P,C1,N1,P1,PR0,2019-10-01,E4A,,R4 3,Z1 4,,Barking and Dagenham,2020 +RULE5_BAD,2019-11-06,P,C1,N1,U6,PR1,2020-03-29,X1,CARPL,R4 3,Z5 2,4890000,Barking and Dagenham,2020 +RULE5_BAD,2020-04-02,P,C1,N1,U6,PR1,,,,R4 3,Z15 2,4890000,Barking and Dagenham,2022 diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py 
b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index b8c29c51..25e1d355 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -80,7 +80,6 @@ def format_datetime(dataframe: pd.DataFrame, date_columns: list) -> pd.DataFrame """ print("format_datetime()...") - # dataframe["DECOM"].apply(pd.to_datetime, format='%Y-%m-%d', errors='raise') dataframe[date_columns] = dataframe[date_columns].apply(pd.to_datetime, format="%Y-%m-%d", errors="raise") return dataframe @@ -92,9 +91,10 @@ def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with column showing latest submission year for each LA and column showing episode source """ + source_for_episode_row = "Original" print("add_latest_year_and_source_for_la()...") dataframe['YEAR_latest'] = dataframe.groupby('LA')['YEAR'].transform('max') - dataframe["Episode_source"] = "Original" + dataframe["Episode_source"] = source_for_episode_row return dataframe @@ -155,11 +155,8 @@ def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) & (row.RNE_next == "S") ) - #dataframe = dataframe.assign(Next_episode_is_duplicate=lambda row: _is_next_episode_duplicate(row)) dataframe = dataframe.assign(Next_episode_is_duplicate=_is_next_episode_duplicate) - #dataframe = dataframe.assign(Previous_episode_is_duplicate=lambda row: _is_previous_episode_duplicate(row)) dataframe = dataframe.assign(Previous_episode_is_duplicate=_is_previous_episode_duplicate) - #dataframe = dataframe.assign(Previous_episode_submitted_later=lambda row: _is_previous_episode_submitted_later(row)) dataframe = dataframe.assign(Previous_episode_submitted_later=_is_previous_episode_submitted_later) return dataframe @@ -181,7 +178,7 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -def _update_dec(row): +def _update_dec_stage1(row): """ Determine updated DEC value. Defaults to input DEC if no rule to apply :param row: Row from dataframe with SSDA903 Episodes data @@ -199,7 +196,7 @@ def _update_dec(row): return row["DEC"] -def _update_rec(row): +def _update_rec_stage1(row): """ Determine updated REC value. Defaults to input REC if no rule to apply :param row: Row from dataframe with SSDA903 Episodes data @@ -215,7 +212,7 @@ def _update_rec(row): return row["REC"] -def _update_reason_place_change(row): +def _update_reason_place_change_stage1(row): """ Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply :param row: Row from dataframe with SSDA903 Episodes data @@ -228,7 +225,7 @@ def _update_reason_place_change(row): return row["REASON_PLACE_CHANGE"] -def _update_episode_source(row): +def _update_episode_source_stage1(row): """ Determine updated Episode_source value. 
Defaults to input value if no rule to apply :param row: Row from dataframe with SSDA903 Episodes data @@ -257,14 +254,57 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: dataframe = dataframe.drop(dataframe[episodes_to_delete].index) # Apply rules 1, 1A, 2 - dataframe["DEC"] = dataframe.apply(_update_dec, axis=1) - dataframe["REC"] = dataframe.apply(_update_rec, axis=1) - dataframe["REASON_PLACE_CHANGE"] = dataframe.apply(_update_reason_place_change, axis=1) - dataframe["Episode_source"] = dataframe.apply(_update_episode_source, axis=1) + dataframe["DEC"] = dataframe.apply(_update_dec_stage1, axis=1) + dataframe["REC"] = dataframe.apply(_update_rec_stage1, axis=1) + dataframe["REASON_PLACE_CHANGE"] = dataframe.apply(_update_reason_place_change_stage1, axis=1) + dataframe["Episode_source"] = dataframe.apply(_update_episode_source_stage1, axis=1) return dataframe +def _overlaps_next_episode(row): + if row["Has_next_episode"]: + return (row.YEAR < row.YEAR_next) & (row.DEC > row.DECOM_next) + return False + + +def _has_x1_gap_before_next_episode(row): + if row["Has_next_episode"]: + return (row.YEAR < row.YEAR_next) & (row.DEC < row.DECOM_next) & (row.REC == "X1") + return False + + +def _stage2_rule_to_apply(row): + if row["Overlaps_next_episode"]: + return "RULE_4" # Overlaps next episode and next episode was submitted later + if row["Has_X1_gap_before_next_episode"]: + return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later + + +def _update_dec_stage2(row): + """ + Determine updated DEC value. Defaults to input DEC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated DEC date + """ + if (row["Rule_to_apply"]=="RULE_4") | (row["Rule_to_apply"]=="RULE_5"): + return row["DECOM_next"] + return row["DEC"] + + +def _update_episode_source_stage2(row): + """ + Determine updated Episode_source value. 
Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data + :return: Updated Episode_source value + """ + if (row["Rule_to_apply"]=="RULE_4") | (row["Rule_to_apply"]=="RULE_5"): + if row["Episode_source"] == "Original": + return row["Rule_to_apply"] + return row["Episode_source"] & " | " & row["Rule_to_apply"] + return row["Episode_source"] + + def add_stage2_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: """ Add columns to identify rows which overlap or underlap surrounding episodes @@ -272,8 +312,10 @@ def add_stage2_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with columns showing true if certain conditions are met """ - print("add_stage2_rule_identifier_columns...TODO") - + print("add_stage2_rule_identifier_columns") + dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() + dataframe["Overlaps_next_episode"] = dataframe.apply(_overlaps_next_episode, axis=1) + dataframe["Has_X1_gap_before_next_episode"] = dataframe.apply(_has_x1_gap_before_next_episode, axis=1) return dataframe @@ -286,7 +328,9 @@ def identify_stage2_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with column showing stage 2 rule to be applied """ - print("identify_stage2_rule_to_apply...TODO") + print("identify_stage2_rule_to_apply") + dataframe["Rule_to_apply"] = dataframe.apply(_stage2_rule_to_apply, axis=1) + dataframe["Episode_source"] = dataframe.apply(_update_episode_source_stage2, axis=1) return dataframe @@ -299,5 +343,7 @@ def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with stage 2 rules applied """ - print("apply_stage2_rules...TODO") + print("apply_stage2_rules") + # Apply rules 4, 5 + dataframe["DEC"] = dataframe.apply(_update_dec_stage2, axis=1) return dataframe \ No newline at end of file diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 056d7d04..bf5258bf 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -241,19 +241,23 @@ def episodes_fix(input, output): s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns(s903_df_stage2) s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2) - # Apply the stage 2 rules TODO + # Apply the stage 2 rules s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2) + s903_df_final = s903_df_stage2_applied[episodes_process.__COLUMNS_TO_KEEP] + # Following code used to test outputs during development print("Dataframe with rules identified:") print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Has_open_episode_error", "Rule_to_apply"]]) print("Dataframe with stage 1 rules applied:") print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "Has_open_episode_error", "Rule_to_apply"]]) - print("Dataframe with stage 2 rules applied...to be developed:") - print(s903_df_stage2_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source"]]) - s903_df_stage2_applied = s903_df_stage2_applied.sort_values(["CHILD", "DECOM"], ignore_index=True) - 
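# --- Illustrative aside (toy sketch, not from the patch): stage 2 rules ---
# RULE_4 closes an episode that overlaps a later-submitted next episode;
# RULE_5 closes an "X1" (continuous) episode that ends short of it. In both
# cases DEC is pulled forward to DECOM_next. Column names come from this
# patch; the data and helper below are assumed for illustration only.
import pandas as pd

toy = pd.DataFrame(
    {
        "YEAR": [2020, 2020],
        "YEAR_next": [2021, 2021],
        "DEC": pd.to_datetime(["2021-02-01", "2020-12-01"]),
        "DECOM_next": pd.to_datetime(["2021-01-01", "2021-01-01"]),
        "REC": ["E4A", "X1"],
        "Has_next_episode": [True, True],
    }
)

def stage2_rule(row):
    later = row.Has_next_episode and row.YEAR < row.YEAR_next
    if later and row.DEC > row.DECOM_next:
        return "RULE_4"  # overlaps the next episode
    if later and row.DEC < row.DECOM_next and row.REC == "X1":
        return "RULE_5"  # gap before the next episode, but marked continuous

toy["Rule_to_apply"] = toy.apply(stage2_rule, axis=1)
toy["DEC"] = toy.apply(
    lambda r: r.DECOM_next if r.Rule_to_apply in ("RULE_4", "RULE_5") else r.DEC,
    axis=1,
)
# First row gets RULE_4, second gets RULE_5; both DEC values become 2021-01-01.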
s903_df_stage2_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + print("Dataframe with stage 2 rules applied:") + print(s903_df_stage2_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "DECOM_next", "YEAR_next","Has_next_episode","Overlaps_next_episode","Has_X1_gap_before_next_episode", "Rule_to_apply"]]) + s903_df_final = s903_df_final.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_final.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", index=False) + print("Final dataframe with all rules applied") + print(s903_df_final) # Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) episodes_fix( From 6634508ebd366dfe54db2ff313e0cd039b852faa Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:33:26 +0000 Subject: [PATCH 11/25] Apply all rule fixes --- .../s903/lds_ssda903_episodes_fix/process.py | 161 +++++++++++------- .../datasets/s903/s903_main_functions.py | 113 +++++++++--- 2 files changed, 194 insertions(+), 80 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 25e1d355..a7cc4f58 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -47,7 +47,10 @@ "Episode_source", ] -def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame: + +def create_previous_and_next_episode( + dataframe: pd.DataFrame, columns: list +) -> pd.DataFrame: """ Add previous and next episode information to each line of a dataframe @@ -55,7 +58,6 @@ def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> :param columns: List of columns containing required data from previous/next episodes :return: Dataframe with columns showing previous and next episodes """ - print("create_previous_and_next_episode()...") for column in columns: dataframe[column + "_previous"] = np.where( dataframe["CHILD"] == dataframe["CHILD"].shift(1), @@ -78,9 +80,9 @@ def format_datetime(dataframe: pd.DataFrame, date_columns: list) -> pd.DataFrame :param columns: List of columns containing dates :return: Dataframe with date columns showing as datetime data type """ - print("format_datetime()...") - - dataframe[date_columns] = dataframe[date_columns].apply(pd.to_datetime, format="%Y-%m-%d", errors="raise") + dataframe[date_columns] = dataframe[date_columns].apply( + pd.to_datetime, format="%Y-%m-%d", errors="raise" + ) return dataframe @@ -92,55 +94,86 @@ def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: :return: Dataframe with column showing latest submission year for each LA and column showing episode source """ source_for_episode_row = "Original" - print("add_latest_year_and_source_for_la()...") - dataframe['YEAR_latest'] = dataframe.groupby('LA')['YEAR'].transform('max') + dataframe["YEAR_latest"] = dataframe.groupby("LA")["YEAR"].transform("max") dataframe["Episode_source"] = source_for_episode_row return dataframe def _is_next_episode_duplicate(row): - return (row.DEC.isnull() & - row.Has_next_episode & - ( (row.DECOM_next != row.DECOM) | (row.DECOM_next.isnull() & row.DECOM.isnull()) ) & - ( (row.RNE_next == row.RNE) | (row.RNE_next.isnull() & row.RNE.isnull()) ) & - ( 
(row.LS_next == row.LS) | (row.LS_next.isnull() & row.LS.isnull()) ) & - ( (row.PLACE_next == row.PLACE) | (row.PLACE_next.isnull() | row.PLACE.isnull()) ) & - ( (row.PLACE_PROVIDER_next == row.PLACE_PROVIDER) | (row.PLACE_PROVIDER_next.isnull() | row.PLACE_PROVIDER.isnull()) ) & - ( (row.PL_POST_next == row.PL_POST) | (row.PL_POST_next.isnull() | row.PL_POST.isnull()) ) & - ( (row.URN_next == row.URN) | (row.URN_next.isnull() | row.URN.isnull()) ) - ) + return ( + row.DEC.isnull() + & row.Has_next_episode + & ( + (row.DECOM_next != row.DECOM) + | (row.DECOM_next.isnull() & row.DECOM.isnull()) + ) + & ((row.RNE_next == row.RNE) | (row.RNE_next.isnull() & row.RNE.isnull())) + & ((row.LS_next == row.LS) | (row.LS_next.isnull() & row.LS.isnull())) + & ( + (row.PLACE_next == row.PLACE) + | (row.PLACE_next.isnull() | row.PLACE.isnull()) + ) + & ( + (row.PLACE_PROVIDER_next == row.PLACE_PROVIDER) + | (row.PLACE_PROVIDER_next.isnull() | row.PLACE_PROVIDER.isnull()) + ) + & ( + (row.PL_POST_next == row.PL_POST) + | (row.PL_POST_next.isnull() | row.PL_POST.isnull()) + ) + & ((row.URN_next == row.URN) | (row.URN_next.isnull() | row.URN.isnull())) + ) def _is_previous_episode_duplicate(row): - return (row.DEC.isnull() & - row.Has_previous_episode & - ( (row.DECOM_previous != row.DECOM) | (row.DECOM_previous.isnull() & row.DECOM.isnull()) ) & - ( (row.RNE_previous == row.RNE) | (row.RNE_previous.isnull() & row.RNE.isnull()) ) & - ( (row.LS_previous == row.LS) | (row.LS_previous.isnull() & row.LS.isnull()) ) & - ( (row.PLACE_previous == row.PLACE) | (row.PLACE_previous.isnull() | row.PLACE.isnull()) ) & - ( (row.PLACE_PROVIDER_previous == row.PLACE_PROVIDER) | (row.PLACE_PROVIDER_previous.isnull() | row.PLACE_PROVIDER.isnull()) ) & - ( (row.PL_POST_previous == row.PL_POST) | (row.PL_POST_previous.isnull() | row.PL_POST.isnull()) ) & - ( (row.URN_previous == row.URN) | (row.URN_previous.isnull() | row.URN.isnull()) ) - ) + return ( + row.DEC.isnull() + & row.Has_previous_episode + & ( + (row.DECOM_previous != row.DECOM) + | (row.DECOM_previous.isnull() & row.DECOM.isnull()) + ) + & ( + (row.RNE_previous == row.RNE) + | (row.RNE_previous.isnull() & row.RNE.isnull()) + ) + & ((row.LS_previous == row.LS) | (row.LS_previous.isnull() & row.LS.isnull())) + & ( + (row.PLACE_previous == row.PLACE) + | (row.PLACE_previous.isnull() | row.PLACE.isnull()) + ) + & ( + (row.PLACE_PROVIDER_previous == row.PLACE_PROVIDER) + | (row.PLACE_PROVIDER_previous.isnull() | row.PLACE_PROVIDER.isnull()) + ) + & ( + (row.PL_POST_previous == row.PL_POST) + | (row.PL_POST_previous.isnull() | row.PL_POST.isnull()) + ) + & ( + (row.URN_previous == row.URN) + | (row.URN_previous.isnull() | row.URN.isnull()) + ) + ) def _is_previous_episode_submitted_later(row): - return (row.DEC.isnull() & - (row.Has_previous_episode) & - (row.YEAR_previous > row.YEAR) - ) + return ( + row.DEC.isnull() & (row.Has_previous_episode) & (row.YEAR_previous > row.YEAR) + ) + def _stage1_rule_to_apply(row): if row["Has_open_episode_error"]: if row["Next_episode_is_duplicate"] | row["Previous_episode_is_duplicate"]: - return "RULE_3" # Duplicate + return "RULE_3" # Duplicate if row["Previous_episode_submitted_later"]: - return "RULE_3A" # Episode replaced in later submission + return "RULE_3A" # Episode replaced in later submission if row["Has_next_episode"] is False: - return "RULE_2" # Ceases LAC + return "RULE_2" # Ceases LAC if row["Has_next_episode_with_RNE_equals_S"]: - return "RULE_1A" # Ceases LAC, but re-enters care later - return "RULE_1" # Remains 
LAC, episode changes + return "RULE_1A" # Ceases LAC, but re-enters care later + return "RULE_1" # Remains LAC, episode changes def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: @@ -150,14 +183,23 @@ def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with columns showing true if certain conditions are met """ - print("add_stage1_rule_identifier_columns...") - dataframe = dataframe.assign(Has_open_episode_error=lambda row: (row.DEC.isnull() ) & (row.YEAR != row.YEAR_latest) ) + dataframe = dataframe.assign( + Has_open_episode_error=lambda row: (row.DEC.isnull()) + & (row.YEAR != row.YEAR_latest) + ) dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() - dataframe = dataframe.assign(Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) & (row.RNE_next == "S") ) + dataframe = dataframe.assign( + Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) + & (row.RNE_next == "S") + ) dataframe = dataframe.assign(Next_episode_is_duplicate=_is_next_episode_duplicate) - dataframe = dataframe.assign(Previous_episode_is_duplicate=_is_previous_episode_duplicate) - dataframe = dataframe.assign(Previous_episode_submitted_later=_is_previous_episode_submitted_later) + dataframe = dataframe.assign( + Previous_episode_is_duplicate=_is_previous_episode_duplicate + ) + dataframe = dataframe.assign( + Previous_episode_submitted_later=_is_previous_episode_submitted_later + ) return dataframe @@ -173,7 +215,6 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with column showing stage 1 rule to be applied """ - print("identify_stage1_rule_to_apply...") dataframe["Rule_to_apply"] = dataframe.apply(_stage1_rule_to_apply, axis=1) return dataframe @@ -189,8 +230,8 @@ def _update_dec_stage1(row): if row["Rule_to_apply"] == "RULE_1": return row["DECOM_next"] if row["Rule_to_apply"] == "RULE_1A": - day_before_next_decom = row["DECOM_next"] - timedelta(days = 1) - return min(end_of_year, day_before_next_decom) # get earliest date + day_before_next_decom = row["DECOM_next"] - timedelta(days=1) + return min(end_of_year, day_before_next_decom) # get earliest date if row["Rule_to_apply"] == "RULE_2": return end_of_year return row["DEC"] @@ -220,7 +261,9 @@ def _update_reason_place_change_stage1(row): """ reason_liia_fix = "LIIAF" if row["Has_open_episode_error"]: - if (row["Rule_to_apply"] == "RULE_1") & (row["RNE_next"] in ("P", "B", "T", "U")): + if (row["Rule_to_apply"] == "RULE_1") & ( + row["RNE_next"] in ("P", "B", "T", "U") + ): return reason_liia_fix return row["REASON_PLACE_CHANGE"] @@ -248,15 +291,16 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with stage 1 rules applied """ - print("apply_stage1_rules") # Apply rules 3, 3A to delete rows - episodes_to_delete = dataframe["Rule_to_apply"].isin( ['RULE_3', 'RULE_3A']) + episodes_to_delete = dataframe["Rule_to_apply"].isin(["RULE_3", "RULE_3A"]) dataframe = dataframe.drop(dataframe[episodes_to_delete].index) # Apply rules 1, 1A, 2 dataframe["DEC"] = dataframe.apply(_update_dec_stage1, axis=1) dataframe["REC"] = dataframe.apply(_update_rec_stage1, axis=1) - dataframe["REASON_PLACE_CHANGE"] = 
dataframe.apply(_update_reason_place_change_stage1, axis=1) + dataframe["REASON_PLACE_CHANGE"] = dataframe.apply( + _update_reason_place_change_stage1, axis=1 + ) dataframe["Episode_source"] = dataframe.apply(_update_episode_source_stage1, axis=1) return dataframe @@ -270,15 +314,17 @@ def _overlaps_next_episode(row): def _has_x1_gap_before_next_episode(row): if row["Has_next_episode"]: - return (row.YEAR < row.YEAR_next) & (row.DEC < row.DECOM_next) & (row.REC == "X1") + return ( + (row.YEAR < row.YEAR_next) & (row.DEC < row.DECOM_next) & (row.REC == "X1") + ) return False def _stage2_rule_to_apply(row): if row["Overlaps_next_episode"]: - return "RULE_4" # Overlaps next episode and next episode was submitted later + return "RULE_4" # Overlaps next episode and next episode was submitted later if row["Has_X1_gap_before_next_episode"]: - return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later + return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later def _update_dec_stage2(row): @@ -287,7 +333,7 @@ def _update_dec_stage2(row): :param row: Row from dataframe with SSDA903 Episodes data :return: Updated DEC date """ - if (row["Rule_to_apply"]=="RULE_4") | (row["Rule_to_apply"]=="RULE_5"): + if (row["Rule_to_apply"] == "RULE_4") | (row["Rule_to_apply"] == "RULE_5"): return row["DECOM_next"] return row["DEC"] @@ -298,7 +344,7 @@ def _update_episode_source_stage2(row): :param row: Row from dataframe with SSDA903 Episodes data :return: Updated Episode_source value """ - if (row["Rule_to_apply"]=="RULE_4") | (row["Rule_to_apply"]=="RULE_5"): + if (row["Rule_to_apply"] == "RULE_4") | (row["Rule_to_apply"] == "RULE_5"): if row["Episode_source"] == "Original": return row["Rule_to_apply"] return row["Episode_source"] & " | " & row["Rule_to_apply"] @@ -312,10 +358,11 @@ def add_stage2_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with columns showing true if certain conditions are met """ - print("add_stage2_rule_identifier_columns") dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() dataframe["Overlaps_next_episode"] = dataframe.apply(_overlaps_next_episode, axis=1) - dataframe["Has_X1_gap_before_next_episode"] = dataframe.apply(_has_x1_gap_before_next_episode, axis=1) + dataframe["Has_X1_gap_before_next_episode"] = dataframe.apply( + _has_x1_gap_before_next_episode, axis=1 + ) return dataframe @@ -328,7 +375,6 @@ def identify_stage2_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with column showing stage 2 rule to be applied """ - print("identify_stage2_rule_to_apply") dataframe["Rule_to_apply"] = dataframe.apply(_stage2_rule_to_apply, axis=1) dataframe["Episode_source"] = dataframe.apply(_update_episode_source_stage2, axis=1) return dataframe @@ -343,7 +389,6 @@ def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame: :param dataframe: Dataframe with SSDA903 Episodes data :return: Dataframe with stage 2 rules applied """ - print("apply_stage2_rules") # Apply rules 4, 5 dataframe["DEC"] = dataframe.apply(_update_dec_stage2, axis=1) - return dataframe \ No newline at end of file + return dataframe diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index bf5258bf..047e1da5 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ 
b/liiatools/datasets/s903/s903_main_functions.py @@ -206,7 +206,7 @@ def sufficiency_output(input, output): def episodes_fix(input, output): - """" + """ " Applies fixes to la_agg SSDA903 Episodes files :param input: should specify the input file location, including file name and suffix, and be usable by a Path function :param output: should specify the path to the output folder @@ -220,15 +220,23 @@ def episodes_fix(input, output): s903_df = common_process.read_file(input) column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) - + # Process stage 1 rule fixes for Episodes table if table_name == "Episodes": # Add columns to dataframe to identify which rules should be applied at stage 1 s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_stage1 = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS) - s903_df_stage1 = episodes_process.format_datetime(s903_df_stage1, episodes_process.__DATES) - s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la(s903_df_stage1) - s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns(s903_df_stage1) + s903_df_stage1 = episodes_process.create_previous_and_next_episode( + s903_df, episodes_process.__COLUMNS + ) + s903_df_stage1 = episodes_process.format_datetime( + s903_df_stage1, episodes_process.__DATES + ) + s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la( + s903_df_stage1 + ) + s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns( + s903_df_stage1 + ) s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1) # Apply the stage 1 rules @@ -236,33 +244,94 @@ def episodes_fix(input, output): # Add columns to dataframe to identify which rules should be applied at stage 2 TODO s903_df_stage2 = s903_df_stage1_applied[episodes_process.__COLUMNS_TO_KEEP] - s903_df_stage2 = episodes_process.create_previous_and_next_episode(s903_df_stage2, episodes_process.__COLUMNS) - s903_df_stage2 = episodes_process.format_datetime(s903_df_stage2, episodes_process.__DATES) - s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns(s903_df_stage2) + s903_df_stage2 = episodes_process.create_previous_and_next_episode( + s903_df_stage2, episodes_process.__COLUMNS + ) + s903_df_stage2 = episodes_process.format_datetime( + s903_df_stage2, episodes_process.__DATES + ) + s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns( + s903_df_stage2 + ) s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2) # Apply the stage 2 rules s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2) s903_df_final = s903_df_stage2_applied[episodes_process.__COLUMNS_TO_KEEP] - - # Following code used to test outputs during development - print("Dataframe with rules identified:") - print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Has_open_episode_error", "Rule_to_apply"]]) - print("Dataframe with stage 1 rules applied:") - print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "Has_open_episode_error", "Rule_to_apply"]]) - print("Dataframe with stage 2 rules applied:") - print(s903_df_stage2_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "DECOM_next", "YEAR_next","Has_next_episode","Overlaps_next_episode","Has_X1_gap_before_next_episode", "Rule_to_apply"]]) s903_df_final = 
s903_df_final.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_final.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", - index=False) - print("Final dataframe with all rules applied") - print(s903_df_final) + s903_df_final.to_csv( + r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + index=False, + ) + + # Following code used to print dataframe outputs during development + print_df = False + if print_df: + print("Dataframe with stage 1 rules identified:") + print( + s903_df_stage1[ + [ + "CHILD", + "YEAR", + "DECOM", + "DEC", + "RNE", + "REC", + "REASON_PLACE_CHANGE", + "Has_open_episode_error", + "Rule_to_apply", + ] + ] + ) + print("Dataframe with stage 1 rules applied:") + print( + s903_df_stage1_applied[ + [ + "CHILD", + "YEAR", + "DECOM", + "DEC", + "RNE", + "REC", + "REASON_PLACE_CHANGE", + "Episode_source", + "Has_open_episode_error", + "Rule_to_apply", + ] + ] + ) + print("Dataframe with stage 2 rules applied:") + print( + s903_df_stage2_applied[ + [ + "CHILD", + "YEAR", + "DECOM", + "DEC", + "RNE", + "REC", + "REASON_PLACE_CHANGE", + "Episode_source", + "DECOM_next", + "YEAR_next", + "Has_next_episode", + "Overlaps_next_episode", + "Has_X1_gap_before_next_episode", + "Rule_to_apply", + ] + ] + ) + + print("Final dataframe with all rules applied") + print(s903_df_final) + # Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) episodes_fix( r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv", - r"liiatools/datasets/s903/lds_ssda903_episodes_fix" + r"liiatools/datasets/s903/lds_ssda903_episodes_fix", ) # poetry run python liiatools/datasets/s903/s903_main_functions.py +# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py" \ No newline at end of file From bb85d64463ca7d96572d26c00bf79d08901e3660 Mon Sep 17 00:00:00 2001 From: patrick-troy <58770937+patrick-troy@users.noreply.github.com> Date: Wed, 13 Mar 2024 11:17:29 +0000 Subject: [PATCH 12/25] add unit test --- tests/s903/test_episodes_fix.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/s903/test_episodes_fix.py diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py new file mode 100644 index 00000000..7338c2da --- /dev/null +++ b/tests/s903/test_episodes_fix.py @@ -0,0 +1,26 @@ +import pandas as pd + +from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import ( + create_previous_and_next_episode, +) + + +def test_create_previous_and_next_episode(): + data = pd.DataFrame( + { + "CHILD": ["123", "123", "123"], + "DECOM": ["2016-07-26", "2016-08-22", "2016-09-13"], + "RNE": ["S", "L", "P"], + "YEAR": [2016, 2016, 2016], + } + ) + + columns = ["DECOM", "RNE", "YEAR"] + + data_with_previous_next_episode = create_previous_and_next_episode(data, columns) + assert data_with_previous_next_episode["DECOM_previous"].tolist() == [None, "2016-07-26", "2016-08-22"] + assert data_with_previous_next_episode["DECOM_next"].tolist() == ["2016-08-22", "2016-09-13", None] + assert data_with_previous_next_episode["RNE_previous"].tolist() == [None, "S", "L"] + assert data_with_previous_next_episode["RNE_next"].tolist() == ["L", "P", None] + assert data_with_previous_next_episode["YEAR_previous"].tolist() == [None, 2016, 2016] + assert data_with_previous_next_episode["YEAR_next"].tolist() == [2016, 2016, None] From 
cd1472edb55d9c3ef90d7ee525e4fe5fa3991473 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Thu, 14 Mar 2024 12:47:45 +0000 Subject: [PATCH 13/25] Add unit test --- tests/s903/test_episodes_fix.py | 43 ++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 7338c2da..cd99f860 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -2,6 +2,7 @@ from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import ( create_previous_and_next_episode, + add_latest_year_and_source_for_la, ) @@ -18,9 +19,45 @@ def test_create_previous_and_next_episode(): columns = ["DECOM", "RNE", "YEAR"] data_with_previous_next_episode = create_previous_and_next_episode(data, columns) - assert data_with_previous_next_episode["DECOM_previous"].tolist() == [None, "2016-07-26", "2016-08-22"] - assert data_with_previous_next_episode["DECOM_next"].tolist() == ["2016-08-22", "2016-09-13", None] + assert data_with_previous_next_episode["DECOM_previous"].tolist() == [ + None, + "2016-07-26", + "2016-08-22", + ] + assert data_with_previous_next_episode["DECOM_next"].tolist() == [ + "2016-08-22", + "2016-09-13", + None, + ] assert data_with_previous_next_episode["RNE_previous"].tolist() == [None, "S", "L"] assert data_with_previous_next_episode["RNE_next"].tolist() == ["L", "P", None] - assert data_with_previous_next_episode["YEAR_previous"].tolist() == [None, 2016, 2016] + assert data_with_previous_next_episode["YEAR_previous"].tolist() == [ + None, + 2016, + 2016, + ] assert data_with_previous_next_episode["YEAR_next"].tolist() == [2016, 2016, None] + + +def test_add_latest_year_and_source_for_la(): + data = pd.DataFrame( + { + "LA": ["BAD", "BAD", "NEW", "NEW"], + "YEAR": [2019, 2020, 2022, 2021], + } + ) + + data_with_latest_year_and_source_for_la = add_latest_year_and_source_for_la(data) + assert data_with_latest_year_and_source_for_la["YEAR_latest"].tolist() == [ + 2020, + 2020, + 2022, + 2022, + ] + assert data_with_latest_year_and_source_for_la["Episode_source"].tolist() == [ + "Original", + "Original", + "Original", + "Original", + ] + From 73d50a259ef4158484446bc54fb28c8b86239c9b Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:27:48 +0000 Subject: [PATCH 14/25] Add test for is_next_episode_duplicate --- .../s903/lds_ssda903_episodes_fix/process.py | 8 +- tests/s903/test_episodes_fix.py | 116 ++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index a7cc4f58..331e49a4 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -111,17 +111,17 @@ def _is_next_episode_duplicate(row): & ((row.LS_next == row.LS) | (row.LS_next.isnull() & row.LS.isnull())) & ( (row.PLACE_next == row.PLACE) - | (row.PLACE_next.isnull() | row.PLACE.isnull()) + | (row.PLACE_next.isnull() & row.PLACE.isnull()) ) & ( (row.PLACE_PROVIDER_next == row.PLACE_PROVIDER) - | (row.PLACE_PROVIDER_next.isnull() | row.PLACE_PROVIDER.isnull()) + | (row.PLACE_PROVIDER_next.isnull() & row.PLACE_PROVIDER.isnull()) ) & ( (row.PL_POST_next == row.PL_POST) - | (row.PL_POST_next.isnull() | row.PL_POST.isnull()) + | (row.PL_POST_next.isnull() & row.PL_POST.isnull()) 
) - & ((row.URN_next == row.URN) | (row.URN_next.isnull() | row.URN.isnull())) + & ((row.URN_next == row.URN) | (row.URN_next.isnull() & row.URN.isnull())) ) diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index cd99f860..573f51ff 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -3,6 +3,7 @@ from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import ( create_previous_and_next_episode, add_latest_year_and_source_for_la, + _is_next_episode_duplicate, ) @@ -61,3 +62,118 @@ def test_add_latest_year_and_source_for_la(): "Original", ] + +def test__is_next_episode_duplicate(): + data = pd.DataFrame( + { + "DEC": [None, None, None, None, None, None, None, None, None], + "Has_next_episode": [True, True, True, True, True, True, True, True, True], + "DECOM": [ + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + ], + "DECOM_next": [ + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + ], + "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P"], + "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None], + "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2"], + "LS_next": ["C2", "C2", "DIFF", "C2", "C2", "C2", "C2", None, None], + "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1"], + "PLACE_next": ["U1", "U1", "U1", "DIFF", "U1", "U1", "U1", None, None], + "PLACE_PROVIDER": [ + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + None, + "PR1", + ], + "PLACE_PROVIDER_next": [ + "PR1", + "PR1", + "PR1", + "PR1", + "DIFF", + "PR1", + "PR1", + None, + None, + ], + "PL_POST": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + None, + "ABC1", + ], + "PL_POST_next": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "DIFF", + "ABC1", + None, + None, + ], + "URN": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + None, + "SC1234", + ], + "URN_next": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "DIFF", + None, + None, + ], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [ + True, + False, + False, + False, + False, + False, + False, + True, + False, + ] From 9904ca176eb2461d0f52c67db3dc9ece657cc25c Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:40:21 +0000 Subject: [PATCH 15/25] WIP Add more tests and function stubs --- .../s903/lds_ssda903_episodes_fix/process.py | 125 +++--- tests/s903/test_episodes_fix.py | 383 +++++++++++++++++- 2 files changed, 449 insertions(+), 59 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 331e49a4..95787157 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -99,7 +99,27 @@ def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -def _is_next_episode_duplicate(row): +def _is_the_same(value_1, value_2) -> bool: + """ + Compare two dataframe cell values and return true if they are both the same or if they are both null + + :param value_1: Dataframe cell value + :param value_2: Dataframe cell value + :return: True if both values 
are the same or if they are both null, False otherwise + """ + return (value_1 == value_2) | (value_1.isnull() & value_2.isnull()) + + +def _is_next_episode_duplicate(row: pd.Series) -> bool: + """ + Determine if this episode and the next episode are duplicates + Criteria: this episode has no end date (DEC) and + has an episode with a later start date (DECOM_next) + and all other values are the same + + :param row: Row from dataframe with SSDA903 Episodes data + :return: True if both episodes are the same, False otherwise + """ return ( row.DEC.isnull() & row.Has_next_episode @@ -107,25 +127,25 @@ def _is_next_episode_duplicate(row): (row.DECOM_next != row.DECOM) | (row.DECOM_next.isnull() & row.DECOM.isnull()) ) - & ((row.RNE_next == row.RNE) | (row.RNE_next.isnull() & row.RNE.isnull())) - & ((row.LS_next == row.LS) | (row.LS_next.isnull() & row.LS.isnull())) - & ( - (row.PLACE_next == row.PLACE) - | (row.PLACE_next.isnull() & row.PLACE.isnull()) - ) - & ( - (row.PLACE_PROVIDER_next == row.PLACE_PROVIDER) - | (row.PLACE_PROVIDER_next.isnull() & row.PLACE_PROVIDER.isnull()) - ) - & ( - (row.PL_POST_next == row.PL_POST) - | (row.PL_POST_next.isnull() & row.PL_POST.isnull()) - ) - & ((row.URN_next == row.URN) | (row.URN_next.isnull() & row.URN.isnull())) + & _is_the_same(row.RNE_next, row.RNE) + & _is_the_same(row.LS_next, row.LS) + & _is_the_same(row.PLACE_next, row.PLACE) + & _is_the_same(row.PLACE_PROVIDER_next, row.PLACE_PROVIDER) + & _is_the_same(row.PL_POST_next, row.PL_POST) + & _is_the_same(row.URN_next, row.URN) ) -def _is_previous_episode_duplicate(row): +def _is_previous_episode_duplicate(row: pd.Series) -> bool: + """ + Determine if this episode and the previous episode are duplicates + Criteria: this episode has no end date (DEC) and + has an episode with an earlier start date (DECOM_previous) + and all other values are the same + + :param row: Row from dataframe with SSDA903 Episodes data + :return: True if both episodes are the same, False otherwise + """ return ( row.DEC.isnull() & row.Has_previous_episode @@ -133,37 +153,34 @@ def _is_previous_episode_duplicate(row): (row.DECOM_previous != row.DECOM) | (row.DECOM_previous.isnull() & row.DECOM.isnull()) ) - & ( - (row.RNE_previous == row.RNE) - | (row.RNE_previous.isnull() & row.RNE.isnull()) - ) - & ((row.LS_previous == row.LS) | (row.LS_previous.isnull() & row.LS.isnull())) - & ( - (row.PLACE_previous == row.PLACE) - | (row.PLACE_previous.isnull() | row.PLACE.isnull()) - ) - & ( - (row.PLACE_PROVIDER_previous == row.PLACE_PROVIDER) - | (row.PLACE_PROVIDER_previous.isnull() | row.PLACE_PROVIDER.isnull()) - ) - & ( - (row.PL_POST_previous == row.PL_POST) - | (row.PL_POST_previous.isnull() | row.PL_POST.isnull()) - ) - & ( - (row.URN_previous == row.URN) - | (row.URN_previous.isnull() | row.URN.isnull()) - ) + & _is_the_same(row.RNE_previous, row.RNE) + & _is_the_same(row.LS_previous, row.LS) + & _is_the_same(row.PLACE_previous, row.PLACE) + & _is_the_same(row.PLACE_PROVIDER_previous, row.PLACE_PROVIDER) + & _is_the_same(row.PL_POST_previous, row.PL_POST) + & _is_the_same(row.URN_previous, row.URN) ) -def _is_previous_episode_submitted_later(row): +def _is_previous_episode_submitted_later(row: pd.Series) -> bool: + """ + Determine if episode with earlier start date (DECOM) was submitted in later file YEAR + + :param row: Row from dataframe with SSDA903 Episodes data + :return: True if previous episode was submitted in later file YEAR, False otherwise + """ return ( row.DEC.isnull() & (row.Has_previous_episode) & 
(row.YEAR_previous > row.YEAR) ) -def _stage1_rule_to_apply(row): +def _stage1_rule_to_apply(row: pd.Series) -> pd.Series: + """ + Determine which Stage 1 rule should be applied + + :param row: Row from dataframe with SSDA903 Episodes data + :return: Name of rule to be applied + """ if row["Has_open_episode_error"]: if row["Next_episode_is_duplicate"] | row["Previous_episode_is_duplicate"]: return "RULE_3" # Duplicate @@ -219,9 +236,10 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -def _update_dec_stage1(row): +def _update_dec_stage1(row: pd.Series) -> pd.Series: """ Determine updated DEC value. Defaults to input DEC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated DEC date """ @@ -237,11 +255,12 @@ def _update_dec_stage1(row): return row["DEC"] -def _update_rec_stage1(row): +def _update_rec_stage1(row: pd.Series) -> pd.Series: """ Determine updated REC value. Defaults to input REC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data - :return: Updated REC value + :return: Updated REC value or the original value if no rule to apply """ episode_ends_liia_fix = "E99" episode_continues = "X1" @@ -253,11 +272,12 @@ def _update_rec_stage1(row): return row["REC"] -def _update_reason_place_change_stage1(row): +def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series: """ Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data - :return: Updated REASON_PLACE_CHANGE value + :return: Updated REASON_PLACE_CHANGE value or the original value if no rule to apply """ reason_liia_fix = "LIIAF" if row["Has_open_episode_error"]: @@ -268,11 +288,12 @@ def _update_reason_place_change_stage1(row): return row["REASON_PLACE_CHANGE"] -def _update_episode_source_stage1(row): +def _update_episode_source_stage1(row: pd.Series) -> pd.Series: """ Determine updated Episode_source value. Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data - :return: Updated Episode_source value + :return: Updated Episode_source value or the original value if no rule to apply """ if row["Has_open_episode_error"]: return row["Rule_to_apply"] @@ -306,13 +327,13 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -def _overlaps_next_episode(row): +def _overlaps_next_episode(row: pd.Series) -> bool: if row["Has_next_episode"]: return (row.YEAR < row.YEAR_next) & (row.DEC > row.DECOM_next) return False -def _has_x1_gap_before_next_episode(row): +def _has_x1_gap_before_next_episode(row: pd.Series) -> bool: if row["Has_next_episode"]: return ( (row.YEAR < row.YEAR_next) & (row.DEC < row.DECOM_next) & (row.REC == "X1") @@ -327,9 +348,10 @@ def _stage2_rule_to_apply(row): return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later -def _update_dec_stage2(row): +def _update_dec_stage2(row: pd.Series) -> pd.Series: """ Determine updated DEC value. Defaults to input DEC if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated DEC date """ @@ -338,9 +360,10 @@ def _update_dec_stage2(row): return row["DEC"] -def _update_episode_source_stage2(row): +def _update_episode_source_stage2(row: pd.Series) -> pd.Series: """ Determine updated Episode_source value. 
Defaults to input value if no rule to apply + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated Episode_source value """ diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 573f51ff..3bc2a0f5 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -3,7 +3,26 @@ from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import ( create_previous_and_next_episode, add_latest_year_and_source_for_la, + _is_the_same, _is_next_episode_duplicate, + _is_previous_episode_duplicate, + _is_previous_episode_submitted_later, + _stage1_rule_to_apply, + add_stage1_rule_identifier_columns, + identify_stage1_rule_to_apply, + _update_dec_stage1, + _update_rec_stage1, + _update_reason_place_change_stage1, + _update_episode_source_stage1, + apply_stage1_rules, + _overlaps_next_episode, + _has_x1_gap_before_next_episode, + _stage2_rule_to_apply, + _update_dec_stage2, + _update_episode_source_stage2, + add_stage2_rule_identifier_columns, + identify_stage2_rule_to_apply, + apply_stage2_rules, ) @@ -63,11 +82,51 @@ def test_add_latest_year_and_source_for_la(): ] +def test__is_the_same(): + data = pd.DataFrame( + { + "VALUE1": ["123", "123", "123", None], + "VALUE2": ["123", "456", None, None], + } + ) + data["Test result"] = _is_the_same(data["VALUE1"], data["VALUE2"]) + assert data["Test result"].tolist() == [ + True, + False, + False, + True, + ] + + def test__is_next_episode_duplicate(): data = pd.DataFrame( { - "DEC": [None, None, None, None, None, None, None, None, None], - "Has_next_episode": [True, True, True, True, True, True, True, True, True], + "DEC": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + "2016-08-31", + None, + ], + "Has_next_episode": [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + ], "DECOM": [ "2016-08-22", "2016-08-22", @@ -78,6 +137,8 @@ def test__is_next_episode_duplicate(): "2016-08-22", "2016-08-22", "2016-08-22", + "2016-08-22", + "2016-08-22", ], "DECOM_next": [ "2016-11-22", @@ -89,13 +150,39 @@ def test__is_next_episode_duplicate(): "2016-11-22", "2016-11-22", "2016-11-22", + "2016-11-22", + None, + ], + "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], + "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], + "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], + "LS_next": [ + "C2", + "C2", + "DIFF", + "C2", + "C2", + "C2", + "C2", + None, + None, + "C2", + None, + ], + "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], + "PLACE_next": [ + "U1", + "U1", + "U1", + "DIFF", + "U1", + "U1", + "U1", + None, + None, + "U1", + None, ], - "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P"], - "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None], - "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2"], - "LS_next": ["C2", "C2", "DIFF", "C2", "C2", "C2", "C2", None, None], - "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1"], - "PLACE_next": ["U1", "U1", "U1", "DIFF", "U1", "U1", "U1", None, None], "PLACE_PROVIDER": [ "PR1", "PR1", @@ -106,6 +193,8 @@ def test__is_next_episode_duplicate(): "PR1", None, "PR1", + "PR1", + "PR1", ], "PLACE_PROVIDER_next": [ "PR1", @@ -117,6 +206,8 @@ def test__is_next_episode_duplicate(): "PR1", None, None, + "PR1", + None, ], "PL_POST": [ "ABC1", @@ -128,6 +219,8 @@ def test__is_next_episode_duplicate(): "ABC1", None, "ABC1", + "ABC1", + "ABC1", ], "PL_POST_next": [ "ABC1", @@ -139,6 
+232,8 @@ def test__is_next_episode_duplicate(): "ABC1", None, None, + "ABC1", + None, ], "URN": [ "SC1234", @@ -150,6 +245,8 @@ def test__is_next_episode_duplicate(): "SC1234", None, "SC1234", + "SC1234", + "SC1234", ], "URN_next": [ "SC1234", @@ -161,6 +258,8 @@ def test__is_next_episode_duplicate(): "DIFF", None, None, + "SC1234", + None, ], } ) @@ -176,4 +275,272 @@ def test__is_next_episode_duplicate(): False, True, False, + False, + False, ] + + +def test__is_previous_episode_duplicate(): + data = pd.DataFrame( + { + "DEC": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + "2016-08-31", + None, + ], + "Has_previous_episode": [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + ], + "DECOM": [ + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + ], + "DECOM_previous": [ + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + "2016-01-22", + None, + ], + "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], + "RNE_previous": [ + "P", + "DIFF", + "P", + "P", + "P", + "P", + "P", + None, + None, + "P", + None, + ], + "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], + "LS_previous": [ + "C2", + "C2", + "DIFF", + "C2", + "C2", + "C2", + "C2", + None, + None, + "C2", + None, + ], + "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], + "PLACE_previous": [ + "U1", + "U1", + "U1", + "DIFF", + "U1", + "U1", + "U1", + None, + None, + "U1", + None, + ], + "PLACE_PROVIDER": [ + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + None, + "PR1", + "PR1", + "PR1", + ], + "PLACE_PROVIDER_previous": [ + "PR1", + "PR1", + "PR1", + "PR1", + "DIFF", + "PR1", + "PR1", + None, + None, + "PR1", + None, + ], + "PL_POST": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + None, + "ABC1", + "ABC1", + "ABC1", + ], + "PL_POST_previous": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "DIFF", + "ABC1", + None, + None, + "ABC1", + None, + ], + "URN": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + None, + "SC1234", + "SC1234", + "SC1234", + ], + "URN_previous": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "DIFF", + None, + None, + "SC1234", + None, + ], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [ + True, + False, + False, + False, + False, + False, + False, + True, + False, + False, + False, + ] + + +def test__is_previous_episode_submitted_later(): + None + + +def test__stage1_rule_to_apply(): + None + + +def test_add_stage1_rule_identifier_columns(): + None + + +def test_identify_stage1_rule_to_apply(): + None + + +def test__update_dec_stage1(): + None + + +def test__update_rec_stage1(): + None + + +def test__update_reason_place_change_stage1(): + None + + +def test__update_episode_source_stage1(): + None + + +def test_apply_stage1_rules(): + None + + +def test_overlaps_next_episode(): + None + + +def test__has_x1_gap_before_next_episode(): + None + + +def test__stage2_rule_to_apply(): + None + + +def test__update_dec_stage2(): + None + + +def test__update_episode_source_stage2(): + None + + +def test_add_stage2_rule_identifier_columns(): + None + + +def test_identify_stage2_rule_to_apply(): + None + 
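# --- Illustrative aside: why _is_the_same ORs in an isnull check ---
# A plain elementwise == follows missing-value semantics, so two nulls
# compare unequal; the helper additionally treats a shared null as a match.
# The toy series below are assumed for illustration only.
import pandas as pd

a = pd.Series(["U1", "U1", None])
b = pd.Series(["U1", "U2", None])

print((a == b).tolist())                                # [True, False, False]
print(((a == b) | (a.isnull() & b.isnull())).tolist())  # [True, False, True]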
+ +def test_apply_stage2_rules(): + None + + +# python -m black "/workspaces/liia-tools/tests/s903/" +# poetry run coverage run -m pytest From 2d3e64d33ba48eaa806c9b1e6b606e640315a99b Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Tue, 2 Apr 2024 14:59:31 +0000 Subject: [PATCH 16/25] WIP Add more test functions for episode fixes --- .../s903/lds_ssda903_episodes_fix/process.py | 81 +++--- tests/s903/test_episodes_fix.py | 259 ++++++++++++++++-- 2 files changed, 276 insertions(+), 64 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 95787157..b2430c54 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -99,6 +99,33 @@ def add_latest_year_and_source_for_la(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe +def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: + """ + Add columns to identify rows with open episodes that meet certain criteria + + :param dataframe: Dataframe with SSDA903 Episodes data + :return: Dataframe with columns showing true if certain conditions are met + """ + dataframe = dataframe.assign( + Has_open_episode_error=lambda row: (row.DEC.isnull()) + & (row.YEAR != row.YEAR_latest) + ) + dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() + dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() + dataframe = dataframe.assign( + Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) + & (row.RNE_next == "S") + ) + dataframe = dataframe.assign(Next_episode_is_duplicate=_is_next_episode_duplicate) + dataframe = dataframe.assign( + Previous_episode_is_duplicate=_is_previous_episode_duplicate + ) + dataframe = dataframe.assign( + Previous_episode_submitted_later=_is_previous_episode_submitted_later + ) + return dataframe + + def _is_the_same(value_1, value_2) -> bool: """ Compare two dataframe cell values and return true if they are both the same or if they are both null @@ -165,7 +192,7 @@ def _is_previous_episode_duplicate(row: pd.Series) -> bool: def _is_previous_episode_submitted_later(row: pd.Series) -> bool: """ Determine if episode with earlier start date (DECOM) was submitted in later file YEAR - + :param row: Row from dataframe with SSDA903 Episodes data :return: True if previous episode was submitted in later file YEAR, False otherwise """ @@ -177,9 +204,9 @@ def _is_previous_episode_submitted_later(row: pd.Series) -> bool: def _stage1_rule_to_apply(row: pd.Series) -> pd.Series: """ Determine which Stage 1 rule should be applied - + :param row: Row from dataframe with SSDA903 Episodes data - :return: Name of rule to be applied + :return: Name of rule to be applied or None if not applicable """ if row["Has_open_episode_error"]: if row["Next_episode_is_duplicate"] | row["Previous_episode_is_duplicate"]: @@ -193,33 +220,6 @@ def _stage1_rule_to_apply(row: pd.Series) -> pd.Series: return "RULE_1" # Remains LAC, episode changes -def add_stage1_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame: - """ - Add columns to identify rows with open episodes that meet certain criteria - - :param dataframe: Dataframe with SSDA903 Episodes data - :return: Dataframe with columns showing true if certain conditions are met - """ - dataframe = dataframe.assign( - Has_open_episode_error=lambda row: (row.DEC.isnull()) - & (row.YEAR != 
row.YEAR_latest) - ) - dataframe["Has_next_episode"] = dataframe["DECOM_next"].notnull() - dataframe["Has_previous_episode"] = dataframe["DECOM_previous"].notnull() - dataframe = dataframe.assign( - Has_next_episode_with_RNE_equals_S=lambda row: (row.Has_next_episode) - & (row.RNE_next == "S") - ) - dataframe = dataframe.assign(Next_episode_is_duplicate=_is_next_episode_duplicate) - dataframe = dataframe.assign( - Previous_episode_is_duplicate=_is_previous_episode_duplicate - ) - dataframe = dataframe.assign( - Previous_episode_submitted_later=_is_previous_episode_submitted_later - ) - return dataframe - - def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: """ Add column to identify which stage 1 rule should be applied: @@ -239,7 +239,7 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: def _update_dec_stage1(row: pd.Series) -> pd.Series: """ Determine updated DEC value. Defaults to input DEC if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated DEC date """ @@ -258,40 +258,37 @@ def _update_dec_stage1(row: pd.Series) -> pd.Series: def _update_rec_stage1(row: pd.Series) -> pd.Series: """ Determine updated REC value. Defaults to input REC if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated REC value or the original value if no rule to apply """ - episode_ends_liia_fix = "E99" - episode_continues = "X1" if row["Has_open_episode_error"]: if row["Rule_to_apply"] == "RULE_1": - return episode_continues + return "X1" if row["Rule_to_apply"] in ("RULE_1A", "RULE_2"): - return episode_ends_liia_fix + return "E99" return row["REC"] def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series: """ Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated REASON_PLACE_CHANGE value or the original value if no rule to apply """ - reason_liia_fix = "LIIAF" if row["Has_open_episode_error"]: if (row["Rule_to_apply"] == "RULE_1") & ( row["RNE_next"] in ("P", "B", "T", "U") ): - return reason_liia_fix + return "LIIAF" return row["REASON_PLACE_CHANGE"] def _update_episode_source_stage1(row: pd.Series) -> pd.Series: """ Determine updated Episode_source value. Defaults to input value if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated Episode_source value or the original value if no rule to apply """ @@ -351,7 +348,7 @@ def _stage2_rule_to_apply(row): def _update_dec_stage2(row: pd.Series) -> pd.Series: """ Determine updated DEC value. Defaults to input DEC if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated DEC date """ @@ -363,7 +360,7 @@ def _update_dec_stage2(row: pd.Series) -> pd.Series: def _update_episode_source_stage2(row: pd.Series) -> pd.Series: """ Determine updated Episode_source value. 
Defaults to input value if no rule to apply - + :param row: Row from dataframe with SSDA903 Episodes data :return: Updated Episode_source value """ diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 3bc2a0f5..81ea9227 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -82,6 +82,48 @@ def test_add_latest_year_and_source_for_la(): ] +def test_add_stage1_rule_identifier_columns(): + data = pd.DataFrame( + { + "DEC": [None, None], + "YEAR": [2019, 2019], + "YEAR_latest": [2022, 2019], + "DECOM_next": ["2019-10-10", None], + "RNE_next": ["S", None], + # following required in Dataframe but not part of test + "YEAR_previous": [None, None], + "DECOM": [None, None], + "DECOM_previous": [None, None], + "RNE": [None, None], + "RNE_previous": [None, None], + "LS": [None, None], + "LS_next": [None, None], + "LS_previous": [None, None], + "PLACE": [None, None], + "PLACE_next": [None, None], + "PLACE_previous": [None, None], + "PLACE_PROVIDER": [None, None], + "PLACE_PROVIDER_next": [None, None], + "PLACE_PROVIDER_previous": [None, None], + "PL_POST": [None, None], + "PL_POST_next": [None, None], + "PL_POST_previous": [None, None], + "URN": [None, None], + "URN_next": [None, None], + "URN_previous": [None, None], + } + ) + data_with_identifiers_added = add_stage1_rule_identifier_columns(data) + assert data_with_identifiers_added["Has_open_episode_error"].tolist() == [ + True, + False, + ] + assert data_with_identifiers_added["Has_next_episode"].tolist() == [True, False] + assert data_with_identifiers_added[ + "Has_next_episode_with_RNE_equals_S" + ].tolist() == [True, False] + + def test__is_the_same(): data = pd.DataFrame( { @@ -475,55 +517,228 @@ def test__is_previous_episode_duplicate(): def test__is_previous_episode_submitted_later(): - None + data = pd.DataFrame( + { + "DEC": ["2016-01-22", None, None, None], + "Has_previous_episode": [True, True, True, None], + "YEAR": [2018, 2018, 2018, 2018], + "YEAR_previous": [2018, 2019, 2017, None], + } + ) + data["Test result"] = _is_previous_episode_submitted_later(data) + assert data["Test result"].tolist() == [ + False, + True, + False, + False, + ] def test__stage1_rule_to_apply(): - None - - -def test_add_stage1_rule_identifier_columns(): - None - - -def test_identify_stage1_rule_to_apply(): - None + data = pd.DataFrame( + { + "Has_open_episode_error": [False, True, True, True, True, True], + "Next_episode_is_duplicate": [None, True, False, False, False, False], + "Previous_episode_is_duplicate": [None, True, False, False, False, False], + "Previous_episode_submitted_later": [ + None, + False, + True, + False, + False, + False, + ], + "Has_next_episode": [None, False, False, False, True, True], + "Has_next_episode_with_RNE_equals_S": [ + None, + False, + False, + False, + True, + False, + ], + } + ) + data["Rule_to_apply"] = data.apply(_stage1_rule_to_apply, axis=1) + assert data["Rule_to_apply"].tolist() == [ + None, + "RULE_3", + "RULE_3A", + "RULE_2", + "RULE_1A", + "RULE_1", + ] def test__update_dec_stage1(): - None + data = pd.DataFrame( + { + "DEC": [None, "2020-11-11", None, None, None, None], + "Has_open_episode_error": [False, False, True, True, True, True], + "Rule_to_apply": [None, None, "RULE_1", "RULE_1A", "RULE_1A", "RULE_2"], + "YEAR": [2022, 2022, 2022, 2022, 2022, 2022], + "DECOM_next": [ + "2021-05-05", + "2021-08-29", + "2022-01-01", + "2022-02-02", + "2022-05-20", + None, + ], + } + ) + data[["DEC", "DECOM_next"]] = data[["DEC", "DECOM_next"]].apply( + 
pd.to_datetime, format="%Y-%m-%d" + ) + data["DEC"] = data.apply(_update_dec_stage1, axis=1) + assert data["DEC"].astype(str).tolist() == [ + "NaT", + "2020-11-11", + "2022-01-01", + "2022-02-01", + "2022-03-31", + "2022-03-31", + ] def test__update_rec_stage1(): - None + data = pd.DataFrame( + { + "REC": [None, "E41", None, None, None], + "Has_open_episode_error": [False, False, True, True, True], + "Rule_to_apply": [None, None, "RULE_1", "RULE_1A", "RULE_2"], + } + ) + data["updated_REC"] = data.apply(_update_rec_stage1, axis=1) + assert data["updated_REC"].tolist() == [ + None, + "E41", + "X1", + "E99", + "E99", + ] def test__update_reason_place_change_stage1(): - None + data = pd.DataFrame( + { + "REASON_PLACE_CHANGE": [ + "CAREPL", + "CAREPL", + "CAREPL", + "CAREPL", + "CAREPL", + "CAREPL", + "OTHER", + ], + "RNE_next": ["P", "P", "P", "B", "T", "U", "S"], + "Has_open_episode_error": [False, False, True, True, True, True, True], + "Rule_to_apply": [ + None, + None, + "RULE_1", + "RULE_1", + "RULE_1", + "RULE_1", + "RULE_2", + ], + } + ) + data["REASON_PLACE_CHANGE"] = data.apply(_update_reason_place_change_stage1, axis=1) + assert data["REASON_PLACE_CHANGE"].tolist() == [ + "CAREPL", + "CAREPL", + "LIIAF", + "LIIAF", + "LIIAF", + "LIIAF", + "OTHER", + ] def test__update_episode_source_stage1(): - None - - -def test_apply_stage1_rules(): - None + data = pd.DataFrame( + { + "Episode_source": ["Original", "Original"], + "Has_open_episode_error": [False, True], + "Rule_to_apply": [None, "RULE_1"], + } + ) + data["Episode_source"] = data.apply(_update_episode_source_stage1, axis=1) + assert data["Episode_source"].tolist() == [ + "Original", + "RULE_1", + ] def test_overlaps_next_episode(): - None + data = pd.DataFrame( + { + "Has_next_episode": [False, True, True, True], + "YEAR": [2020, 2020, 2020, 2020], + "YEAR_next": [None, 2021, 2021, 2019], + "DEC": ["2021-01-31", "2021-01-31", "2021-01-31", "2021-01-31"], + "DECOM_next": [None, "2022-02-02", "2021-01-01", "2021-01-01"], + } + ) + data["test_result"] = data.apply(_overlaps_next_episode, axis=1) + assert data["test_result"].tolist() == [ + False, + False, + True, + False, + ] def test__has_x1_gap_before_next_episode(): - None + data = pd.DataFrame( + { + "Has_next_episode": [False, True, True, True], + "YEAR": [2020, 2020, 2020, 2020], + "YEAR_next": [None, 2021, 2021, 2019], + "DEC": ["2021-01-31", "2021-01-31", "2021-01-31", "2021-01-31"], + "DECOM_next": [None, "2021-01-31", "2021-03-01", "2021-03-01"], + "REC": ["E43", "X1", "X1", "X1"], + } + ) + data["test_result"] = data.apply(_has_x1_gap_before_next_episode, axis=1) + assert data["test_result"].tolist() == [ + False, + False, + True, + False, + ] def test__stage2_rule_to_apply(): - None + data = pd.DataFrame( + { + "Overlaps_next_episode": [False, True, False], + "Has_X1_gap_before_next_episode": [False, False, True], + } + ) + data["test_result"] = data.apply(_stage2_rule_to_apply, axis=1) + assert data["test_result"].tolist() == [ + None, + "RULE_4", + "RULE_5", + ] def test__update_dec_stage2(): - None + data = pd.DataFrame( + { + "DEC": ["2021-01-01", "2021-01-01", "2021-01-01"], + "DECOM_next": ["2022-11-11", "2022-11-11", "2022-11-11"], + "Rule_to_apply": [None, "RULE_4", "RULE_5"], + } + ) + data["test_result"] = data.apply(_update_dec_stage2, axis=1) + assert data["test_result"].tolist() == [ + "2021-01-01", + "2022-11-11", + "2022-11-11", + ] def test__update_episode_source_stage2(): From 6d7fa6b662f95195835e3080cea670b56d1d7822 Mon Sep 17 00:00:00 2001 From: Stephen 
C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 3 Apr 2024 08:19:44 +0000 Subject: [PATCH 17/25] Add more unit tests for episode fix --- .../s903/lds_ssda903_episodes_fix/process.py | 2 +- tests/s903/test_episodes_fix.py | 25 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index b2430c54..655de931 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -367,7 +367,7 @@ def _update_episode_source_stage2(row: pd.Series) -> pd.Series: if (row["Rule_to_apply"] == "RULE_4") | (row["Rule_to_apply"] == "RULE_5"): if row["Episode_source"] == "Original": return row["Rule_to_apply"] - return row["Episode_source"] & " | " & row["Rule_to_apply"] + return row["Episode_source"] + " | " + row["Rule_to_apply"] return row["Episode_source"] diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 81ea9227..2b8299f7 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -742,19 +742,18 @@ def test__update_dec_stage2(): def test__update_episode_source_stage2(): - None - - -def test_add_stage2_rule_identifier_columns(): - None - - -def test_identify_stage2_rule_to_apply(): - None - - -def test_apply_stage2_rules(): - None + data = pd.DataFrame( + { + "Episode_source": ["Original", "Original", "RULE_1"], + "Rule_to_apply": [None, "RULE_4", "RULE_5"], + } + ) + data["test_result"] = data.apply(_update_episode_source_stage2, axis=1) + assert data["test_result"].tolist() == [ + "Original", + "RULE_4", + "RULE_1 | RULE_5", + ] # python -m black "/workspaces/liia-tools/tests/s903/" From 89490c2ac804b0ee9fcffbeea86e35cde16c6fff Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Mon, 20 May 2024 14:55:58 +0000 Subject: [PATCH 18/25] Implemented suggestions --- .../s903/lds_ssda903_episodes_fix/process.py | 61 +++++++-- .../datasets/s903/s903_main_functions.py | 116 ++---------------- tests/s903/test_episodes_fix.py | 8 +- 3 files changed, 65 insertions(+), 120 deletions(-) diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py index 655de931..44e96c71 100644 --- a/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py +++ b/liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py @@ -1,6 +1,6 @@ +from datetime import datetime, timedelta import numpy as np import pandas as pd -from datetime import datetime, timedelta __COLUMNS = [ "DECOM", @@ -77,7 +77,7 @@ def format_datetime(dataframe: pd.DataFrame, date_columns: list) -> pd.DataFrame Format date columns to datetime type :param dataframe: Dataframe with SSDA903 Episodes data - :param columns: List of columns containing dates + :param date_columns: List of columns containing dates :return: Dataframe with date columns showing as datetime data type """ dataframe[date_columns] = dataframe[date_columns].apply( @@ -201,7 +201,7 @@ def _is_previous_episode_submitted_later(row: pd.Series) -> bool: ) -def _stage1_rule_to_apply(row: pd.Series) -> pd.Series: +def _stage1_rule_to_apply(row: pd.Series) -> str: """ Determine which Stage 1 rule should be applied @@ -236,7 +236,7 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -def _update_dec_stage1(row: pd.Series) -> pd.Series: 
+def _update_dec_stage1(row: pd.Series) -> datetime: """ Determine updated DEC value. Defaults to input DEC if no rule to apply @@ -255,7 +255,7 @@ def _update_dec_stage1(row: pd.Series) -> pd.Series: return row["DEC"] -def _update_rec_stage1(row: pd.Series) -> pd.Series: +def _update_rec_stage1(row: pd.Series) -> str: """ Determine updated REC value. Defaults to input REC if no rule to apply @@ -270,7 +270,7 @@ def _update_rec_stage1(row: pd.Series) -> pd.Series: return row["REC"] -def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series: +def _update_reason_place_change_stage1(row: pd.Series) -> str: """ Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply @@ -285,7 +285,7 @@ def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series: return row["REASON_PLACE_CHANGE"] -def _update_episode_source_stage1(row: pd.Series) -> pd.Series: +def _update_episode_source_stage1(row: pd.Series) -> str: """ Determine updated Episode_source value. Defaults to input value if no rule to apply @@ -338,14 +338,14 @@ def _has_x1_gap_before_next_episode(row: pd.Series) -> bool: return False -def _stage2_rule_to_apply(row): +def _stage2_rule_to_apply(row: pd.Series) -> str: if row["Overlaps_next_episode"]: return "RULE_4" # Overlaps next episode and next episode was submitted later if row["Has_X1_gap_before_next_episode"]: return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later -def _update_dec_stage2(row: pd.Series) -> pd.Series: +def _update_dec_stage2(row: pd.Series) -> datetime: """ Determine updated DEC value. Defaults to input DEC if no rule to apply @@ -357,7 +357,7 @@ def _update_dec_stage2(row: pd.Series) -> pd.Series: return row["DEC"] -def _update_episode_source_stage2(row: pd.Series) -> pd.Series: +def _update_episode_source_stage2(row: pd.Series) -> str: """ Determine updated Episode_source value. 
Defaults to input value if no rule to apply @@ -412,3 +412,44 @@ def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame: # Apply rules 4, 5 dataframe["DEC"] = dataframe.apply(_update_dec_stage2, axis=1) return dataframe + + +def stage_1(s903_df: pd.DataFrame) -> pd.DataFrame: + """ + Accept an s903 episodes dataframe and apply the stage 1 rules + + :param s903_df: Dataframe with SSDA903 Episodes data + :return: Dataframe with stage 1 rules identified and applied + """ + # Add columns to dataframe to identify which rules should be applied at stage 1 + s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_stage1 = create_previous_and_next_episode(s903_df, __COLUMNS) + s903_df_stage1 = format_datetime(s903_df_stage1, __DATES) + s903_df_stage1 = add_latest_year_and_source_for_la(s903_df_stage1) + s903_df_stage1 = add_stage1_rule_identifier_columns(s903_df_stage1) + s903_df_stage1 = identify_stage1_rule_to_apply(s903_df_stage1) + + # Apply the stage 1 rules + s903_df_stage1_applied = apply_stage1_rules(s903_df_stage1) + return s903_df_stage1_applied + + +def stage_2(s903_df: pd.DataFrame) -> pd.DataFrame: + """ + Accept an s903 episodes dataframe and apply the stage 2 rules + + :param s903_df: Dataframe with SSDA903 Episodes data + :return: Dataframe with stage 2 rules identified and applied + """ + s903_df_stage2 = s903_df[__COLUMNS_TO_KEEP] + s903_df_stage2 = create_previous_and_next_episode(s903_df_stage2, __COLUMNS) + s903_df_stage2 = format_datetime(s903_df_stage2, __DATES) + s903_df_stage2 = add_stage2_rule_identifier_columns(s903_df_stage2) + s903_df_stage2 = identify_stage2_rule_to_apply(s903_df_stage2) + + # Apply the stage 2 rules + s903_df_stage2_applied = apply_stage2_rules(s903_df_stage2) + + s903_df_final = s903_df_stage2_applied[__COLUMNS_TO_KEEP] + s903_df_final = s903_df_final.sort_values(["CHILD", "DECOM"], ignore_index=True) + return s903_df_final diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 1450eb30..7777cebd 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -1,8 +1,8 @@ from pathlib import Path -import yaml import logging -import click_log from datetime import datetime +import yaml +import click_log # dependencies for cleanfile() from liiatools.datasets.s903.lds_ssda903_clean import ( @@ -25,7 +25,8 @@ from liiatools.datasets.s903.lds_ssda903_sufficiency import process as suff_process # dependencies for episodes fix() -from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process +# from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process +from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import stage_1, stage_2 from liiatools.spec import common as common_asset_dir from liiatools.datasets.shared_functions import ( @@ -221,117 +222,22 @@ def episodes_fix(input, output): column_names = config["column_names"] table_name = common_process.match_load_file(s903_df, column_names) - # Process stage 1 rule fixes for Episodes table + # Process stage 1 and 2 rule fixes for Episodes table if table_name == "Episodes": - # Add columns to dataframe to identify which rules should be applied at stage 1 - s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True) - s903_df_stage1 = episodes_process.create_previous_and_next_episode( - s903_df, episodes_process.__COLUMNS - ) - s903_df_stage1 = episodes_process.format_datetime( - s903_df_stage1, 
episodes_process.__DATES - ) - s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la( - s903_df_stage1 - ) - s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns( - s903_df_stage1 - ) - s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1) - - # Apply the stage 1 rules - s903_df_stage1_applied = episodes_process.apply_stage1_rules(s903_df_stage1) - - # Add columns to dataframe to identify which rules should be applied at stage 2 TODO - s903_df_stage2 = s903_df_stage1_applied[episodes_process.__COLUMNS_TO_KEEP] - s903_df_stage2 = episodes_process.create_previous_and_next_episode( - s903_df_stage2, episodes_process.__COLUMNS - ) - s903_df_stage2 = episodes_process.format_datetime( - s903_df_stage2, episodes_process.__DATES - ) - s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns( - s903_df_stage2 - ) - s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2) - - # Apply the stage 2 rules - s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2) - - s903_df_final = s903_df_stage2_applied[episodes_process.__COLUMNS_TO_KEEP] - s903_df_final = s903_df_final.sort_values(["CHILD", "DECOM"], ignore_index=True) + s903_df_stage1_applied = stage_1(s903_df) + s903_df_final = stage_2(s903_df_stage1_applied) + output_path = Path(output, "SSDA903_episodes_fixed.csv") s903_df_final.to_csv( - r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv", + output_path, index=False, ) - # Following code used to print dataframe outputs during development - print_df = False - if print_df: - print("Dataframe with stage 1 rules identified:") - print( - s903_df_stage1[ - [ - "CHILD", - "YEAR", - "DECOM", - "DEC", - "RNE", - "REC", - "REASON_PLACE_CHANGE", - "Has_open_episode_error", - "Rule_to_apply", - ] - ] - ) - print("Dataframe with stage 1 rules applied:") - print( - s903_df_stage1_applied[ - [ - "CHILD", - "YEAR", - "DECOM", - "DEC", - "RNE", - "REC", - "REASON_PLACE_CHANGE", - "Episode_source", - "Has_open_episode_error", - "Rule_to_apply", - ] - ] - ) - print("Dataframe with stage 2 rules applied:") - print( - s903_df_stage2_applied[ - [ - "CHILD", - "YEAR", - "DECOM", - "DEC", - "RNE", - "REC", - "REASON_PLACE_CHANGE", - "Episode_source", - "DECOM_next", - "YEAR_next", - "Has_next_episode", - "Overlaps_next_episode", - "Has_X1_gap_before_next_episode", - "Rule_to_apply", - ] - ] - ) - - print("Final dataframe with all rules applied") - print(s903_df_final) - # Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) episodes_fix( r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv", - r"liiatools/datasets/s903/lds_ssda903_episodes_fix", + r"liiatools/datasets/s903/lds_ssda903_episodes_fix/", ) # poetry run python liiatools/datasets/s903/s903_main_functions.py -# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py" \ No newline at end of file +# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py" diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 2b8299f7..fc2cda70 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -9,20 +9,15 @@ _is_previous_episode_submitted_later, _stage1_rule_to_apply, add_stage1_rule_identifier_columns, - identify_stage1_rule_to_apply, _update_dec_stage1, _update_rec_stage1, 
_update_reason_place_change_stage1, _update_episode_source_stage1, - apply_stage1_rules, _overlaps_next_episode, _has_x1_gap_before_next_episode, _stage2_rule_to_apply, _update_dec_stage2, _update_episode_source_stage2, - add_stage2_rule_identifier_columns, - identify_stage2_rule_to_apply, - apply_stage2_rules, ) @@ -122,6 +117,9 @@ def test_add_stage1_rule_identifier_columns(): assert data_with_identifiers_added[ "Has_next_episode_with_RNE_equals_S" ].tolist() == [True, False] + assert data_with_identifiers_added["Next_episode_is_duplicate"].tolist() == [False, False] + assert data_with_identifiers_added["Previous_episode_is_duplicate"].tolist() == [False, False] + assert data_with_identifiers_added["Previous_episode_submitted_later"].tolist() == [False, False] def test__is_the_same(): From d5111faa14b2b9e3638314f8c56d3be14ff5f8eb Mon Sep 17 00:00:00 2001 From: Patrick Troy Date: Fri, 24 May 2024 10:43:20 +0100 Subject: [PATCH 19/25] add temp unit test rewriter --- tests/s903/rewrite_unit_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/s903/rewrite_unit_test.py diff --git a/tests/s903/rewrite_unit_test.py b/tests/s903/rewrite_unit_test.py new file mode 100644 index 00000000..e69de29b From 8e425652da06a9de7e6f756caa33f56ec5103679 Mon Sep 17 00:00:00 2001 From: Patrick Troy Date: Fri, 24 May 2024 10:44:29 +0100 Subject: [PATCH 20/25] add temp unit test rewriter --- tests/s903/rewrite_unit_test.py | 174 ++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/tests/s903/rewrite_unit_test.py b/tests/s903/rewrite_unit_test.py index e69de29b..f8548fbe 100644 --- a/tests/s903/rewrite_unit_test.py +++ b/tests/s903/rewrite_unit_test.py @@ -0,0 +1,174 @@ +import pandas as pd +data = pd.DataFrame( + { + "DEC": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + "2016-08-31", + None, + ], + "Has_next_episode": [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + ], + "DECOM": [ + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + "2016-08-22", + ], + "DECOM_next": [ + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + "2016-11-22", + None, + ], + "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], + "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], + "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], + "LS_next": [ + "C2", + "C2", + "DIFF", + "C2", + "C2", + "C2", + "C2", + None, + None, + "C2", + None, + ], + "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], + "PLACE_next": [ + "U1", + "U1", + "U1", + "DIFF", + "U1", + "U1", + "U1", + None, + None, + "U1", + None, + ], + "PLACE_PROVIDER": [ + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + "PR1", + None, + "PR1", + "PR1", + "PR1", + ], + "PLACE_PROVIDER_next": [ + "PR1", + "PR1", + "PR1", + "PR1", + "DIFF", + "PR1", + "PR1", + None, + None, + "PR1", + None, + ], + "PL_POST": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + None, + "ABC1", + "ABC1", + "ABC1", + ], + "PL_POST_next": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "ABC1", + "DIFF", + "ABC1", + None, + None, + "ABC1", + None, + ], + "URN": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + None, + "SC1234", + 
"SC1234", + "SC1234", + ], + "URN_next": [ + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "SC1234", + "DIFF", + None, + None, + "SC1234", + None, + ], + } + ) + +for index, row in data.iterrows(): + print(f"\ndef test__is_next_episode_duplicate_():") + print(f" data = pd.DataFrame (\n {{") + for column, value in row.items(): + print(f' "{column}": [{value}],') + print(" }\n )") + print(f' data["Test result"] = _is_next_episode_duplicate(data)') + print(f' assert data["Test result"].tolist() == []') From 479135fd1619475ccee4885e92d30521c0eabffb Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Wed, 29 May 2024 14:15:02 +0000 Subject: [PATCH 21/25] Expanded unit tests for episode fixes --- tests/s903/test_episodes_fix.py | 1098 +++++++++++++++++++++---------- 1 file changed, 738 insertions(+), 360 deletions(-) diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index fc2cda70..85cb7b93 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -138,380 +138,758 @@ def test__is_the_same(): ] -def test__is_next_episode_duplicate(): +# def test__is_next_episode_duplicate(): +# data = pd.DataFrame( +# { +# "DEC": [ +# None, +# None, +# None, +# None, +# None, +# None, +# None, +# None, +# None, +# "2016-08-31", +# None, +# ], +# "Has_next_episode": [ +# True, +# True, +# True, +# True, +# True, +# True, +# True, +# True, +# True, +# True, +# False, +# ], +# "DECOM": [ +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# "2016-08-22", +# ], +# "DECOM_next": [ +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# "2016-11-22", +# None, +# ], +# "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], +# "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], +# "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], +# "LS_next": [ +# "C2", +# "C2", +# "DIFF", +# "C2", +# "C2", +# "C2", +# "C2", +# None, +# None, +# "C2", +# None, +# ], +# "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], +# "PLACE_next": [ +# "U1", +# "U1", +# "U1", +# "DIFF", +# "U1", +# "U1", +# "U1", +# None, +# None, +# "U1", +# None, +# ], +# "PLACE_PROVIDER": [ +# "PR1", +# "PR1", +# "PR1", +# "PR1", +# "PR1", +# "PR1", +# "PR1", +# None, +# "PR1", +# "PR1", +# "PR1", +# ], +# "PLACE_PROVIDER_next": [ +# "PR1", +# "PR1", +# "PR1", +# "PR1", +# "DIFF", +# "PR1", +# "PR1", +# None, +# None, +# "PR1", +# None, +# ], +# "PL_POST": [ +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# None, +# "ABC1", +# "ABC1", +# "ABC1", +# ], +# "PL_POST_next": [ +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# "ABC1", +# "DIFF", +# "ABC1", +# None, +# None, +# "ABC1", +# None, +# ], +# "URN": [ +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# None, +# "SC1234", +# "SC1234", +# "SC1234", +# ], +# "URN_next": [ +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "SC1234", +# "DIFF", +# None, +# None, +# "SC1234", +# None, +# ], +# } +# ) + +# data["Test result"] = _is_next_episode_duplicate(data) +# assert data["Test result"].tolist() == [ +# True, +# False, +# False, +# False, +# False, +# False, +# False, +# True, +# False, +# False, +# False, +# ] 
+ +def test__is_next_episode_duplicate_true(): data = pd.DataFrame( { - "DEC": [ - None, - None, - None, - None, - None, - None, - None, - None, - None, - "2016-08-31", - None, - ], - "Has_next_episode": [ - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - ], - "DECOM": [ - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - ], - "DECOM_next": [ - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - None, - ], - "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], - "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], - "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], - "LS_next": [ - "C2", - "C2", - "DIFF", - "C2", - "C2", - "C2", - "C2", - None, - None, - "C2", - None, - ], - "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], - "PLACE_next": [ - "U1", - "U1", - "U1", - "DIFF", - "U1", - "U1", - "U1", - None, - None, - "U1", - None, - ], - "PLACE_PROVIDER": [ - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - None, - "PR1", - "PR1", - "PR1", - ], - "PLACE_PROVIDER_next": [ - "PR1", - "PR1", - "PR1", - "PR1", - "DIFF", - "PR1", - "PR1", - None, - None, - "PR1", - None, - ], - "PL_POST": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - None, - "ABC1", - "ABC1", - "ABC1", - ], - "PL_POST_next": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "DIFF", - "ABC1", - None, - None, - "ABC1", - None, - ], - "URN": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - None, - "SC1234", - "SC1234", - "SC1234", - ], - "URN_next": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "DIFF", - None, - None, - "SC1234", - None, - ], + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [ - True, - False, - False, - False, - False, - False, - False, - True, - False, - False, - False, - ] + assert data["Test result"].tolist() == [True,] -def test__is_previous_episode_duplicate(): +def test__is_next_episode_duplicate_rne_diff(): data = pd.DataFrame( { - "DEC": [ - None, - None, - None, - None, - None, - None, - None, - None, - None, - "2016-08-31", - None, - ], - "Has_previous_episode": [ - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - ], - "DECOM": [ - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - ], - "DECOM_previous": [ - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - "2016-01-22", - None, - ], - "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], - "RNE_previous": [ - "P", - "DIFF", - "P", - "P", - "P", - "P", - "P", - None, - None, - "P", 
- None, - ], - "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], - "LS_previous": [ - "C2", - "C2", - "DIFF", - "C2", - "C2", - "C2", - "C2", - None, - None, - "C2", - None, - ], - "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], - "PLACE_previous": [ - "U1", - "U1", - "U1", - "DIFF", - "U1", - "U1", - "U1", - None, - None, - "U1", - None, - ], - "PLACE_PROVIDER": [ - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - None, - "PR1", - "PR1", - "PR1", - ], - "PLACE_PROVIDER_previous": [ - "PR1", - "PR1", - "PR1", - "PR1", - "DIFF", - "PR1", - "PR1", - None, - None, - "PR1", - None, - ], - "PL_POST": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - None, - "ABC1", - "ABC1", - "ABC1", - ], - "PL_POST_previous": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "DIFF", - "ABC1", - None, - None, - "ABC1", - None, - ], - "URN": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - None, - "SC1234", - "SC1234", - "SC1234", - ], - "URN_previous": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "DIFF", - None, - None, - "SC1234", - None, - ], + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": [ "DIFF",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_ls_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["DIFF",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_place_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["DIFF",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_provider_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["DIFF",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def 
test__is_next_episode_duplicate_pl_post_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["DIFF",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_urn_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["DIFF",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_all_none(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": [ None,], + "RNE_next": [ None,], + "LS": [ None,], + "LS_next": [None,], + "PLACE": [None,], + "PLACE_next": [None,], + "PLACE_PROVIDER": [None,], + "PLACE_PROVIDER_next": [None,], + "PL_POST": [None,], + "PL_POST_next": [None,], + "URN": [None,], + "URN_next": [None,], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [True,] + + +def test__is_next_episode_duplicate_next_none(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": [ None,], + "LS": ["C2",], + "LS_next": [None,], + "PLACE": ["U1",], + "PLACE_next": [None,], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": [None,], + "PL_POST": ["ABC1",], + "PL_POST_next": [None,], + "URN": ["SC1234",], + "URN_next": [None,], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_episode_closed(): + data = pd.DataFrame( + { + "DEC": ["2016-08-31",], + "Has_next_episode": [True,], + "DECOM": ["2016-08-22",], + "DECOM_next": ["2016-11-22",], + "RNE": ["P",], + "RNE_next": ["P",], + "LS": ["C2",], + "LS_next": ["C2",], + "PLACE": ["U1",], + "PLACE_next": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_next": ["ABC1",], + "URN": ["SC1234",], + "URN_next": ["SC1234",], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_next_episode_duplicate_no_next_ep(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_next_episode": [False,], + "DECOM": ["2016-08-22",], + "DECOM_next": [None,], + "RNE": ["P",], + "RNE_next": [ None,], + "LS": ["C2",], + "LS_next": [None,], + "PLACE": ["U1",], + "PLACE_next": [None,], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_next": [None,], + "PL_POST": ["ABC1",], + "PL_POST_next": [None,], + "URN": ["SC1234",], + "URN_next": [None,], + } + ) + + data["Test result"] = _is_next_episode_duplicate(data) + assert data["Test result"].tolist() 
== [False,] + + + +def test__is_previous_episode_duplicate_true(): + data = pd.DataFrame( + { + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [ - True, - False, - False, - False, - False, - False, - False, - True, - False, - False, - False, - ] + assert data["Test result"].tolist() == [True] + + +def test__is_previous_episode_duplicate_rne_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + "RNE_previous": ["DIFF",], + "LS": ["C2",], + "LS_previous": ["C2",], + "PLACE": ["U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_ls_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + "RNE_previous": ["P",], + "LS": ["C2",], + "LS_previous": ["DIFF",], + "PLACE": ["U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_place_differ(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + "RNE_previous": ["P",], + "LS": ["C2",], + "LS_previous": ["C2",], + "PLACE": ["U1",], + "PLACE_previous": ["DIFF",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_provider_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + "RNE_previous": ["P",], + "LS": ["C2",], + "LS_previous": ["C2",], + "PLACE": ["U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["DIFF",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_pl_post_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + 
"RNE_previous": ["P",], + "LS": ["C2",], + "LS_previous": ["C2",], + "PLACE": ["U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["DIFF",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_urn_diff(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": ["P",], + "RNE_previous": ["P",], + "LS": ["C2",], + "LS_previous": ["C2",], + "PLACE": ["U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["DIFF",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_all_none(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": [ None,], + "RNE_previous": [None,], + "LS": [ None,], + "LS_previous": [None,], + "PLACE": [ None,], + "PLACE_previous": [None,], + "PLACE_PROVIDER": [None,], + "PLACE_PROVIDER_previous": [None,], + "PL_POST": [None,], + "PL_POST_previous": [None,], + "URN": [None,], + "URN_previous": [None,], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [True,] + + +def test__is_previous_episode_duplicate_prev_none(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": [ "P",], + "RNE_previous": [None,], + "LS": [ "C2",], + "LS_previous": [None,], + "PLACE": [ "U1",], + "PLACE_previous": [None,], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": [None,], + "PL_POST": ["ABC1",], + "PL_POST_previous": [None,], + "URN": ["SC1234",], + "URN_previous": [None,], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_episode_closed(): + data = pd.DataFrame( + { + "DEC": ["2016-08-31",], + "Has_previous_episode": [True,], + "DECOM": ["2018-01-01",], + "DECOM_previous": ["2016-01-22",], + "RNE": [ "P",], + "RNE_previous": ["P",], + "LS": [ "C2",], + "LS_previous": ["C2",], + "PLACE": [ "U1",], + "PLACE_previous": ["U1",], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": ["PR1",], + "PL_POST": ["ABC1",], + "PL_POST_previous": ["ABC1",], + "URN": ["SC1234",], + "URN_previous": ["SC1234",], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert data["Test result"].tolist() == [False,] + + +def test__is_previous_episode_duplicate_no_prev_ep(): + data = pd.DataFrame( + { + "DEC": [None,], + "Has_previous_episode": [False,], + "DECOM": [None,], + "DECOM_previous": [None,], + "RNE": [ "P",], + "RNE_previous": [None,], + "LS": [ "C2",], + "LS_previous": [None,], + "PLACE": [ "U1",], + "PLACE_previous": [None,], + "PLACE_PROVIDER": ["PR1",], + "PLACE_PROVIDER_previous": [None,], + "PL_POST": ["ABC1",], + "PL_POST_previous": [None,], + "URN": ["SC1234",], + "URN_previous": [None,], + } + ) + + data["Test result"] = _is_previous_episode_duplicate(data) + assert 
data["Test result"].tolist() == [False,] def test__is_previous_episode_submitted_later(): From bded68cab03a69b499645b388152449ea7cd83c4 Mon Sep 17 00:00:00 2001 From: Patrick Troy Date: Thu, 30 May 2024 15:33:19 +0100 Subject: [PATCH 22/25] cleanup tests, remove test_rewriter --- .../datasets/s903/lds_ssda903_clean/prep.py | 108 +-- tests/s903/rewrite_unit_test.py | 174 ---- tests/s903/test_episodes_fix.py | 911 +++++++----------- 3 files changed, 421 insertions(+), 772 deletions(-) delete mode 100644 tests/s903/rewrite_unit_test.py diff --git a/liiatools/datasets/s903/lds_ssda903_clean/prep.py b/liiatools/datasets/s903/lds_ssda903_clean/prep.py index f1bb29dd..2f12c4d7 100644 --- a/liiatools/datasets/s903/lds_ssda903_clean/prep.py +++ b/liiatools/datasets/s903/lds_ssda903_clean/prep.py @@ -1,65 +1,65 @@ import logging import pandas as pd from pathlib import Path -import cchardet as chardet +# import cchardet as chardet from datetime import datetime import pandas.errors -def file_encoding_audit( - data_folder: Path, -) -> pd.DataFrame: - """ - Function takes in a folder path object, it then uses the cchardet library to fast detect the file encoding types - - :param data_folder: Path object that is a folder containing files to be processed - :type data_folder: Path - :return: A Dataframe of low confidence encoded files - :rtype: pd.DataFrame - """ - - # TODO - Check csv encoding type of file, save to utf-8 - # TODO - Check xml encoding type of file - # Save as an acceptable format - result_out = [] - - for cdf in data_folder.glob("**/*"): - if cdf.is_file() and "log" not in cdf.root: - with open(cdf, "rb") as f: - msg = f.read() - result = chardet.detect(msg) - out = f"{cdf.parts[-3]}, {cdf.stem}, {result}" - # this is messy. - outt = ( - out.replace("}", "") - .replace("{", "") - .replace("confidence", "") - .replace("encoding", "") - .replace("'':", "") - ) - result_out.append(outt) - - # Save the outputs of the list generated by running cchardet on the file list, - # the result is then appended into a dataframe and filtered to return a list of files that - # have low confidence as to their encoding types. - encoding_series = pd.Series(result_out) - - encoding_df = pd.DataFrame(encoding_series, columns=["file_name"]) - - # Split out dataframe - encoding_df[ - ["local_authority", "file_name", "encoding", "confidence"] - ] = encoding_df.file_name.str.split(",", expand=True) - - # Filter out log files and drop high confidence files types - encoded_df = encoding_df[ - ~encoding_df["file_name"].str.contains("Logs") - & ~(encoding_df["confidence"].str.contains("1.0")) - ] - - encoded_df.to_csv("encoding_audit.csv", encoding="utf-8") - return encoded_df +# def file_encoding_audit( +# data_folder: Path, +# ) -> pd.DataFrame: +# """ +# Function takes in a folder path object, it then uses the cchardet library to fast detect the file encoding types +# +# :param data_folder: Path object that is a folder containing files to be processed +# :type data_folder: Path +# :return: A Dataframe of low confidence encoded files +# :rtype: pd.DataFrame +# """ +# +# # TODO - Check csv encoding type of file, save to utf-8 +# # TODO - Check xml encoding type of file +# # Save as an acceptable format +# result_out = [] +# +# for cdf in data_folder.glob("**/*"): +# if cdf.is_file() and "log" not in cdf.root: +# with open(cdf, "rb") as f: +# msg = f.read() +# result = chardet.detect(msg) +# out = f"{cdf.parts[-3]}, {cdf.stem}, {result}" +# # this is messy. 
+# outt = ( +# out.replace("}", "") +# .replace("{", "") +# .replace("confidence", "") +# .replace("encoding", "") +# .replace("'':", "") +# ) +# result_out.append(outt) +# +# # Save the outputs of the list generated by running cchardet on the file list, +# # the result is then appended into a dataframe and filtered to return a list of files that +# # have low confidence as to their encoding types. +# encoding_series = pd.Series(result_out) +# +# encoding_df = pd.DataFrame(encoding_series, columns=["file_name"]) +# +# # Split out dataframe +# encoding_df[ +# ["local_authority", "file_name", "encoding", "confidence"] +# ] = encoding_df.file_name.str.split(",", expand=True) +# +# # Filter out log files and drop high confidence files types +# encoded_df = encoding_df[ +# ~encoding_df["file_name"].str.contains("Logs") +# & ~(encoding_df["confidence"].str.contains("1.0")) +# ] +# +# encoded_df.to_csv("encoding_audit.csv", encoding="utf-8") +# return encoded_df def delete_unrequired_files(input: str, drop_file_list: list, la_log_dir: str): diff --git a/tests/s903/rewrite_unit_test.py b/tests/s903/rewrite_unit_test.py deleted file mode 100644 index f8548fbe..00000000 --- a/tests/s903/rewrite_unit_test.py +++ /dev/null @@ -1,174 +0,0 @@ -import pandas as pd -data = pd.DataFrame( - { - "DEC": [ - None, - None, - None, - None, - None, - None, - None, - None, - None, - "2016-08-31", - None, - ], - "Has_next_episode": [ - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - ], - "DECOM": [ - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - "2016-08-22", - ], - "DECOM_next": [ - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - "2016-11-22", - None, - ], - "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], - "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], - "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], - "LS_next": [ - "C2", - "C2", - "DIFF", - "C2", - "C2", - "C2", - "C2", - None, - None, - "C2", - None, - ], - "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], - "PLACE_next": [ - "U1", - "U1", - "U1", - "DIFF", - "U1", - "U1", - "U1", - None, - None, - "U1", - None, - ], - "PLACE_PROVIDER": [ - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - "PR1", - None, - "PR1", - "PR1", - "PR1", - ], - "PLACE_PROVIDER_next": [ - "PR1", - "PR1", - "PR1", - "PR1", - "DIFF", - "PR1", - "PR1", - None, - None, - "PR1", - None, - ], - "PL_POST": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - None, - "ABC1", - "ABC1", - "ABC1", - ], - "PL_POST_next": [ - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "ABC1", - "DIFF", - "ABC1", - None, - None, - "ABC1", - None, - ], - "URN": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - None, - "SC1234", - "SC1234", - "SC1234", - ], - "URN_next": [ - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "SC1234", - "DIFF", - None, - None, - "SC1234", - None, - ], - } - ) - -for index, row in data.iterrows(): - print(f"\ndef test__is_next_episode_duplicate_():") - print(f" data = pd.DataFrame (\n {{") - for column, value in row.items(): - print(f' "{column}": [{value}],') - print(" }\n )") - print(f' data["Test result"] = _is_next_episode_duplicate(data)') - print(f' assert data["Test 
result"].tolist() == []') diff --git a/tests/s903/test_episodes_fix.py b/tests/s903/test_episodes_fix.py index 85cb7b93..2298c1df 100644 --- a/tests/s903/test_episodes_fix.py +++ b/tests/s903/test_episodes_fix.py @@ -117,9 +117,18 @@ def test_add_stage1_rule_identifier_columns(): assert data_with_identifiers_added[ "Has_next_episode_with_RNE_equals_S" ].tolist() == [True, False] - assert data_with_identifiers_added["Next_episode_is_duplicate"].tolist() == [False, False] - assert data_with_identifiers_added["Previous_episode_is_duplicate"].tolist() == [False, False] - assert data_with_identifiers_added["Previous_episode_submitted_later"].tolist() == [False, False] + assert data_with_identifiers_added["Next_episode_is_duplicate"].tolist() == [ + False, + False, + ] + assert data_with_identifiers_added["Previous_episode_is_duplicate"].tolist() == [ + False, + False, + ] + assert data_with_identifiers_added["Previous_episode_submitted_later"].tolist() == [ + False, + False, + ] def test__is_the_same(): @@ -138,472 +147,290 @@ def test__is_the_same(): ] -# def test__is_next_episode_duplicate(): -# data = pd.DataFrame( -# { -# "DEC": [ -# None, -# None, -# None, -# None, -# None, -# None, -# None, -# None, -# None, -# "2016-08-31", -# None, -# ], -# "Has_next_episode": [ -# True, -# True, -# True, -# True, -# True, -# True, -# True, -# True, -# True, -# True, -# False, -# ], -# "DECOM": [ -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# "2016-08-22", -# ], -# "DECOM_next": [ -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# "2016-11-22", -# None, -# ], -# "RNE": ["P", "P", "P", "P", "P", "P", "P", None, "P", "P", "P"], -# "RNE_next": ["P", "DIFF", "P", "P", "P", "P", "P", None, None, "P", None], -# "LS": ["C2", "C2", "C2", "C2", "C2", "C2", "C2", None, "C2", "C2", "C2"], -# "LS_next": [ -# "C2", -# "C2", -# "DIFF", -# "C2", -# "C2", -# "C2", -# "C2", -# None, -# None, -# "C2", -# None, -# ], -# "PLACE": ["U1", "U1", "U1", "U1", "U1", "U1", "U1", None, "U1", "U1", "U1"], -# "PLACE_next": [ -# "U1", -# "U1", -# "U1", -# "DIFF", -# "U1", -# "U1", -# "U1", -# None, -# None, -# "U1", -# None, -# ], -# "PLACE_PROVIDER": [ -# "PR1", -# "PR1", -# "PR1", -# "PR1", -# "PR1", -# "PR1", -# "PR1", -# None, -# "PR1", -# "PR1", -# "PR1", -# ], -# "PLACE_PROVIDER_next": [ -# "PR1", -# "PR1", -# "PR1", -# "PR1", -# "DIFF", -# "PR1", -# "PR1", -# None, -# None, -# "PR1", -# None, -# ], -# "PL_POST": [ -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# None, -# "ABC1", -# "ABC1", -# "ABC1", -# ], -# "PL_POST_next": [ -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# "ABC1", -# "DIFF", -# "ABC1", -# None, -# None, -# "ABC1", -# None, -# ], -# "URN": [ -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# None, -# "SC1234", -# "SC1234", -# "SC1234", -# ], -# "URN_next": [ -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "SC1234", -# "DIFF", -# None, -# None, -# "SC1234", -# None, -# ], -# } -# ) - -# data["Test result"] = _is_next_episode_duplicate(data) -# assert data["Test result"].tolist() == [ -# True, -# False, -# False, -# False, -# False, -# False, -# False, -# True, -# False, -# False, -# False, -# ] - def test__is_next_episode_duplicate_true(): data = pd.DataFrame( { - "DEC": [None,], - 
"Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [True,] + assert data["Test result"].tolist() == [True] def test__is_next_episode_duplicate_rne_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": [ "DIFF",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["DIFF"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_ls_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["DIFF",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["DIFF"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_place_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["DIFF",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + 
"RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["DIFF"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_provider_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["DIFF",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["DIFF"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_pl_post_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["DIFF",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["DIFF"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_urn_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["DIFF",], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["DIFF"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_all_none(): data = pd.DataFrame( { - "DEC": [None,], 
- "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": [ None,], - "RNE_next": [ None,], - "LS": [ None,], - "LS_next": [None,], - "PLACE": [None,], - "PLACE_next": [None,], - "PLACE_PROVIDER": [None,], - "PLACE_PROVIDER_next": [None,], - "PL_POST": [None,], - "PL_POST_next": [None,], - "URN": [None,], - "URN_next": [None,], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": [None], + "RNE_next": [None], + "LS": [None], + "LS_next": [None], + "PLACE": [None], + "PLACE_next": [None], + "PLACE_PROVIDER": [None], + "PLACE_PROVIDER_next": [None], + "PL_POST": [None], + "PL_POST_next": [None], + "URN": [None], + "URN_next": [None], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [True,] + assert data["Test result"].tolist() == [True] def test__is_next_episode_duplicate_next_none(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": [ None,], - "LS": ["C2",], - "LS_next": [None,], - "PLACE": ["U1",], - "PLACE_next": [None,], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": [None,], - "PL_POST": ["ABC1",], - "PL_POST_next": [None,], - "URN": ["SC1234",], - "URN_next": [None,], + "DEC": [None], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": [None], + "LS": ["C2"], + "LS_next": [None], + "PLACE": ["U1"], + "PLACE_next": [None], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": [None], + "PL_POST": ["ABC1"], + "PL_POST_next": [None], + "URN": ["SC1234"], + "URN_next": [None], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_episode_closed(): data = pd.DataFrame( { - "DEC": ["2016-08-31",], - "Has_next_episode": [True,], - "DECOM": ["2016-08-22",], - "DECOM_next": ["2016-11-22",], - "RNE": ["P",], - "RNE_next": ["P",], - "LS": ["C2",], - "LS_next": ["C2",], - "PLACE": ["U1",], - "PLACE_next": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_next": ["ABC1",], - "URN": ["SC1234",], - "URN_next": ["SC1234",], + "DEC": ["2016-08-31"], + "Has_next_episode": [True], + "DECOM": ["2016-08-22"], + "DECOM_next": ["2016-11-22"], + "RNE": ["P"], + "RNE_next": ["P"], + "LS": ["C2"], + "LS_next": ["C2"], + "PLACE": ["U1"], + "PLACE_next": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_next": ["ABC1"], + "URN": ["SC1234"], + "URN_next": ["SC1234"], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_next_episode_duplicate_no_next_ep(): data = pd.DataFrame( { - "DEC": [None,], - "Has_next_episode": [False,], - "DECOM": ["2016-08-22",], - "DECOM_next": [None,], - "RNE": ["P",], - "RNE_next": [ None,], - "LS": ["C2",], - "LS_next": [None,], - "PLACE": ["U1",], - "PLACE_next": [None,], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_next": [None,], - "PL_POST": ["ABC1",], - "PL_POST_next": [None,], - "URN": ["SC1234",], - "URN_next": [None,], + "DEC": [None], + "Has_next_episode": [False], + "DECOM": ["2016-08-22"], + "DECOM_next": [None], + "RNE": ["P"], + "RNE_next": [None], + "LS": ["C2"], + 
"LS_next": [None], + "PLACE": ["U1"], + "PLACE_next": [None], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_next": [None], + "PL_POST": ["ABC1"], + "PL_POST_next": [None], + "URN": ["SC1234"], + "URN_next": [None], } ) data["Test result"] = _is_next_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] - + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_true(): @@ -635,261 +462,261 @@ def test__is_previous_episode_duplicate_true(): def test__is_previous_episode_duplicate_rne_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], + "DEC": [None], + "Has_previous_episode": [True], "DECOM": ["2018-01-01"], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["DIFF",], - "LS": ["C2",], - "LS_previous": ["C2",], - "PLACE": ["U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["DIFF"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_ls_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], + "DEC": [None], + "Has_previous_episode": [True], "DECOM": ["2018-01-01"], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["P",], - "LS": ["C2",], - "LS_previous": ["DIFF",], - "PLACE": ["U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["DIFF"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_place_differ(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["P",], - "LS": ["C2",], - "LS_previous": ["C2",], - "PLACE": ["U1",], - "PLACE_previous": ["DIFF",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["DIFF"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) 
data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_provider_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["P",], - "LS": ["C2",], - "LS_previous": ["C2",], - "PLACE": ["U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["DIFF",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["DIFF"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_pl_post_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["P",], - "LS": ["C2",], - "LS_previous": ["C2",], - "PLACE": ["U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["DIFF",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["DIFF"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_urn_diff(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": ["P",], - "RNE_previous": ["P",], - "LS": ["C2",], - "LS_previous": ["C2",], - "PLACE": ["U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["DIFF",], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["DIFF"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_all_none(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - 
"DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": [ None,], - "RNE_previous": [None,], - "LS": [ None,], - "LS_previous": [None,], - "PLACE": [ None,], - "PLACE_previous": [None,], - "PLACE_PROVIDER": [None,], - "PLACE_PROVIDER_previous": [None,], - "PL_POST": [None,], - "PL_POST_previous": [None,], - "URN": [None,], - "URN_previous": [None,], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": [None], + "RNE_previous": [None], + "LS": [None], + "LS_previous": [None], + "PLACE": [None], + "PLACE_previous": [None], + "PLACE_PROVIDER": [None], + "PLACE_PROVIDER_previous": [None], + "PL_POST": [None], + "PL_POST_previous": [None], + "URN": [None], + "URN_previous": [None], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [True,] + assert data["Test result"].tolist() == [True] def test__is_previous_episode_duplicate_prev_none(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": [ "P",], - "RNE_previous": [None,], - "LS": [ "C2",], - "LS_previous": [None,], - "PLACE": [ "U1",], - "PLACE_previous": [None,], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": [None,], - "PL_POST": ["ABC1",], - "PL_POST_previous": [None,], - "URN": ["SC1234",], - "URN_previous": [None,], + "DEC": [None], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": [None], + "LS": ["C2"], + "LS_previous": [None], + "PLACE": ["U1"], + "PLACE_previous": [None], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": [None], + "PL_POST": ["ABC1"], + "PL_POST_previous": [None], + "URN": ["SC1234"], + "URN_previous": [None], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_episode_closed(): data = pd.DataFrame( { - "DEC": ["2016-08-31",], - "Has_previous_episode": [True,], - "DECOM": ["2018-01-01",], - "DECOM_previous": ["2016-01-22",], - "RNE": [ "P",], - "RNE_previous": ["P",], - "LS": [ "C2",], - "LS_previous": ["C2",], - "PLACE": [ "U1",], - "PLACE_previous": ["U1",], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": ["PR1",], - "PL_POST": ["ABC1",], - "PL_POST_previous": ["ABC1",], - "URN": ["SC1234",], - "URN_previous": ["SC1234",], + "DEC": ["2016-08-31"], + "Has_previous_episode": [True], + "DECOM": ["2018-01-01"], + "DECOM_previous": ["2016-01-22"], + "RNE": ["P"], + "RNE_previous": ["P"], + "LS": ["C2"], + "LS_previous": ["C2"], + "PLACE": ["U1"], + "PLACE_previous": ["U1"], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": ["PR1"], + "PL_POST": ["ABC1"], + "PL_POST_previous": ["ABC1"], + "URN": ["SC1234"], + "URN_previous": ["SC1234"], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_duplicate_no_prev_ep(): data = pd.DataFrame( { - "DEC": [None,], - "Has_previous_episode": [False,], - "DECOM": [None,], - "DECOM_previous": [None,], - "RNE": [ "P",], - "RNE_previous": [None,], - "LS": [ "C2",], - "LS_previous": [None,], - "PLACE": [ "U1",], - "PLACE_previous": [None,], - "PLACE_PROVIDER": ["PR1",], - "PLACE_PROVIDER_previous": [None,], - "PL_POST": ["ABC1",], - "PL_POST_previous": 
[None,], - "URN": ["SC1234",], - "URN_previous": [None,], + "DEC": [None], + "Has_previous_episode": [False], + "DECOM": [None], + "DECOM_previous": [None], + "RNE": ["P"], + "RNE_previous": [None], + "LS": ["C2"], + "LS_previous": [None], + "PLACE": ["U1"], + "PLACE_previous": [None], + "PLACE_PROVIDER": ["PR1"], + "PLACE_PROVIDER_previous": [None], + "PL_POST": ["ABC1"], + "PL_POST_previous": [None], + "URN": ["SC1234"], + "URN_previous": [None], } ) data["Test result"] = _is_previous_episode_duplicate(data) - assert data["Test result"].tolist() == [False,] + assert data["Test result"].tolist() == [False] def test__is_previous_episode_submitted_later(): @@ -1130,7 +957,3 @@ def test__update_episode_source_stage2(): "RULE_4", "RULE_1 | RULE_5", ] - - -# python -m black "/workspaces/liia-tools/tests/s903/" -# poetry run coverage run -m pytest From 6e85e293df8a749a8f10bc44f06069c51c768df5 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 31 May 2024 08:20:52 +0000 Subject: [PATCH 23/25] Update pan-agg yml --- liiatools/datasets/s903/s903_cli.py | 25 +++++++++++++++++++ .../datasets/s903/s903_main_functions.py | 2 +- liiatools/spec/s903/pan-agg.yml | 2 ++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/liiatools/datasets/s903/s903_cli.py b/liiatools/datasets/s903/s903_cli.py index 0ae14d6a..89fd5dd8 100644 --- a/liiatools/datasets/s903/s903_cli.py +++ b/liiatools/datasets/s903/s903_cli.py @@ -90,6 +90,31 @@ def la_agg(input, output): s903_main_functions.la_agg(input, output) +@s903.command() +@click.option( + "--i", + "input", + required=True, + type=str, + help="A string specifying the input file location, including the file name and suffix, usable by a pathlib Path function", +) +@click.option( + "--o", + "output", + required=True, + type=str, + help="A string specifying the output directory location", +) +def episodes_fix(input, output): + """ + Applies fixes to la_agg SSDA903 Episodes files + :param input: should specify the input file location, including file name and suffix, and be usable by a Path function + :param output: should specify the path to the output folder + :return: None + """ + s903_main_functions.episodes_fix(input, output) + + @s903.command() @click.option( "--i", diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 7777cebd..59303298 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -207,7 +207,7 @@ def sufficiency_output(input, output): def episodes_fix(input, output): - """ " + """ Applies fixes to la_agg SSDA903 Episodes files :param input: should specify the input file location, including file name and suffix, and be usable by a Path function :param output: should specify the path to the output folder diff --git a/liiatools/spec/s903/pan-agg.yml b/liiatools/spec/s903/pan-agg.yml index 38b3dee2..983f6116 100644 --- a/liiatools/spec/s903/pan-agg.yml +++ b/liiatools/spec/s903/pan-agg.yml @@ -25,6 +25,8 @@ column_names: - URN - LA - YEAR + - YEAR_latest + - Episode_source Reviews: - CHILD - DOB From 0127ed26f62e40882d763dbeb272406a4fa54518 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 31 May 2024 12:25:10 +0000 Subject: [PATCH 24/25] Episode fixes minor tidy up --- liiatools/datasets/s903/lds_ssda903_clean/prep.py | 1 + liiatools/datasets/s903/s903_cli.py | 4 ++-- liiatools/datasets/s903/s903_main_functions.py | 
10 ---------- .../SSDA903_episodes_for_testing_fixes_INPUT.csv | 0 4 files changed, 3 insertions(+), 12 deletions(-) rename liiatools/{datasets/s903/lds_ssda903_episodes_fix => spec/s903/samples}/SSDA903_episodes_for_testing_fixes_INPUT.csv (100%) diff --git a/liiatools/datasets/s903/lds_ssda903_clean/prep.py b/liiatools/datasets/s903/lds_ssda903_clean/prep.py index 2f12c4d7..df07d767 100644 --- a/liiatools/datasets/s903/lds_ssda903_clean/prep.py +++ b/liiatools/datasets/s903/lds_ssda903_clean/prep.py @@ -1,6 +1,7 @@ import logging import pandas as pd from pathlib import Path + # import cchardet as chardet from datetime import datetime diff --git a/liiatools/datasets/s903/s903_cli.py b/liiatools/datasets/s903/s903_cli.py index 89fd5dd8..6a982234 100644 --- a/liiatools/datasets/s903/s903_cli.py +++ b/liiatools/datasets/s903/s903_cli.py @@ -113,8 +113,8 @@ def episodes_fix(input, output): :return: None """ s903_main_functions.episodes_fix(input, output) - - + + @s903.command() @click.option( "--i", diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index 59303298..c77be888 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -231,13 +231,3 @@ def episodes_fix(input, output): output_path, index=False, ) - - -# Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule) -episodes_fix( - r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv", - r"liiatools/datasets/s903/lds_ssda903_episodes_fix/", -) - -# poetry run python liiatools/datasets/s903/s903_main_functions.py -# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py" diff --git a/liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv b/liiatools/spec/s903/samples/SSDA903_episodes_for_testing_fixes_INPUT.csv similarity index 100% rename from liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv rename to liiatools/spec/s903/samples/SSDA903_episodes_for_testing_fixes_INPUT.csv From d2c70ce8f87ca4fd0519e7d88749926470485b46 Mon Sep 17 00:00:00 2001 From: Stephen C <127780498+StephenCarterLIIA@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:21:10 +0000 Subject: [PATCH 25/25] Removed comment --- liiatools/datasets/s903/s903_main_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/liiatools/datasets/s903/s903_main_functions.py b/liiatools/datasets/s903/s903_main_functions.py index c77be888..25c70b42 100644 --- a/liiatools/datasets/s903/s903_main_functions.py +++ b/liiatools/datasets/s903/s903_main_functions.py @@ -25,7 +25,6 @@ from liiatools.datasets.s903.lds_ssda903_sufficiency import process as suff_process # dependencies for episodes fix() -# from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import stage_1, stage_2 from liiatools.spec import common as common_asset_dir
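
Note on the series: the tests above pin down the duplicate-detection semantics
(an open episode, DEC missing, is a duplicate when every tracked field repeats
in the adjacent episode, with a pair of missing values counting as a match),
but the implementation in lds_ssda903_episodes_fix/process.py does not appear
in the hunks above. The following is a minimal sketch reconstructed from the
assertions alone, not the shipped code; _MATCH_COLUMNS and _columns_match are
names invented here for illustration.

    from functools import reduce
    import operator

    import pandas as pd

    # Fields compared against their "_next"/"_previous" counterparts, inferred
    # from the columns the tests populate (DECOM/DEC are handled separately and
    # may differ between duplicate episodes).
    _MATCH_COLUMNS = ["RNE", "LS", "PLACE", "PLACE_PROVIDER", "PL_POST", "URN"]


    def _columns_match(dataframe: pd.DataFrame, column: str, suffix: str) -> pd.Series:
        # A pair of missing values counts as a match, per the *_all_none tests.
        current, adjacent = dataframe[column], dataframe[column + suffix]
        return (current == adjacent) | (current.isna() & adjacent.isna())


    def _is_next_episode_duplicate(dataframe: pd.DataFrame) -> pd.Series:
        # Only an open episode (DEC missing) that actually has a next episode
        # can be flagged, and every tracked field must repeat in that episode.
        matches = [_columns_match(dataframe, c, "_next") for c in _MATCH_COLUMNS]
        return (
            dataframe["DEC"].isna()
            & dataframe["Has_next_episode"]
            & reduce(operator.and_, matches)
        )


    def _is_previous_episode_duplicate(dataframe: pd.DataFrame) -> pd.Series:
        # Symmetric check against the "_previous" columns.
        matches = [_columns_match(dataframe, c, "_previous") for c in _MATCH_COLUMNS]
        return (
            dataframe["DEC"].isna()
            & dataframe["Has_previous_episode"]
            & reduce(operator.and_, matches)
        )

PATCH 23 also wires episodes_fix into the s903 click group, and PATCH 24
removes the hardcoded invocation from s903_main_functions.py, so the command
is now reached only via the CLI. Going only by the --i/--o options defined in
the diff, an invocation would look something like this (the console-script
name and file paths are placeholders, and the command spelling, underscore or
dash, depends on how the installed click version derives names from the
function name):

    liiatools s903 episodes-fix --i /path/to/episodes_la_agg.csv --o /path/to/output_folder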