Skip to content

Commit

Permalink
WIP - stage 1 rules working
Browse files Browse the repository at this point in the history
  • Loading branch information
StephenCarterLIIA authored Feb 26, 2024
1 parent 1632b25 commit 892f6aa
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 11 deletions.
127 changes: 124 additions & 3 deletions liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

__COLUMNS = [
"DECOM",
Expand All @@ -26,6 +27,25 @@
"DEC_next",
]

# Episode columns carried forward once the stage 1 rules have been applied.
# The caller selects these from the stage 1 output before the stage 2 checks,
# so helper columns created for stage 1 (anything not named here) are dropped.
__COLUMNS_TO_KEEP = [
"CHILD",
"LA",
"DECOM",
"RNE",
"LS",
"CIN",
"PLACE",
"PLACE_PROVIDER",
"DEC",
"REC",
"REASON_PLACE_CHANGE",
"HOME_POST",
"PL_POST",
"URN",
"YEAR",
"YEAR_latest",
"Episode_source",
]

def create_previous_and_next_episode(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -161,6 +181,64 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame:
return dataframe


def _update_dec(row):
    """
    Determine updated DEC value. Defaults to input DEC if no rule to apply

    RULE_1 uses the next episode's DECOM, RULE_2 uses 31 March of the row's YEAR, and
    RULE_1A uses whichever is earlier of 31 March of YEAR and the day before the next DECOM.
    The 31 March date is only built when RULE_1A or RULE_2 applies, so rows that are left
    unchanged do not need a usable YEAR value.

    :param row: Row from dataframe with SSDA903 Episodes data
    :return: Updated DEC date
    """
    if not row["Has_open_episode_error"]:
        return row["DEC"]

    rule = row["Rule_to_apply"]
    if rule == "RULE_1":
        return row["DECOM_next"]
    if rule in ("RULE_1A", "RULE_2"):
        # int() accepts a YEAR that arrives as a float (e.g. 2019.0) or numeric string;
        # datetime() itself rejects anything that is not an integer.
        end_of_year = datetime(int(row["YEAR"]), 3, 31)
        if rule == "RULE_2":
            return end_of_year
        day_before_next_decom = row["DECOM_next"] - timedelta(days=1)
        return min(end_of_year, day_before_next_decom)  # get earliest date
    return row["DEC"]


def _update_rec(row):
    """
    Work out the REC (reason episode ceased) value after stage 1 fixes.

    Rows without an open episode error, or with a rule other than RULE_1 / RULE_1A / RULE_2,
    keep the REC they came in with.

    :param row: Row from dataframe with SSDA903 Episodes data
    :return: Updated REC value
    """
    if not row["Has_open_episode_error"]:
        return row["REC"]

    rule = row["Rule_to_apply"]
    if rule == "RULE_1":
        # Episode continues into the next one
        return "X1"
    # RULE_1A and RULE_2 close the episode with the LIIA fix code
    return "E99" if rule in ("RULE_1A", "RULE_2") else row["REC"]


def _update_reason_place_change(row):
    """
    Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply

    The LIIA fix reason is only used when the row has an open episode error, RULE_1 applies
    and the next episode's RNE is one of "P", "B", "T" or "U".

    :param row: Row from dataframe with SSDA903 Episodes data
    :return: Updated REASON_PLACE_CHANGE value
    """
    reason_liia_fix = "LIIAF"
    # Logical `and` short-circuits, so RNE_next is only read when RULE_1 actually applies
    # (bitwise `&` evaluated both sides unconditionally).
    needs_fix = (
        row["Has_open_episode_error"]
        and row["Rule_to_apply"] == "RULE_1"
        and row["RNE_next"] in ("P", "B", "T", "U")
    )
    if needs_fix:
        return reason_liia_fix
    return row["REASON_PLACE_CHANGE"]


def _update_episode_source(row):
    """
    Work out the Episode_source value after stage 1 fixes.

    Rows flagged with an open episode error take the name of the rule applied to them;
    all other rows keep their existing Episode_source.

    :param row: Row from dataframe with SSDA903 Episodes data
    :return: Updated Episode_source value
    """
    source_column = "Rule_to_apply" if row["Has_open_episode_error"] else "Episode_source"
    return row[source_column]


def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame:
"""
Apply stage 1 rules:
Expand All @@ -173,10 +251,53 @@ def apply_stage1_rules(dataframe: pd.DataFrame) -> pd.DataFrame:
:param dataframe: Dataframe with SSDA903 Episodes data
:return: Dataframe with stage 1 rules applied
"""
print("apply_stage1_rules...TODO")
print("apply_stage1_rules")
# Apply rules 3, 3A to delete rows
episodes_to_delete = dataframe["Rule_to_apply"].isin( ['RULE_3', 'RULE_3A'])
dataframe = dataframe.drop(dataframe[episodes_to_delete].index)

# write code here for rules 1, 1A, 2

# Apply rules 1, 1A, 2
dataframe["DEC"] = dataframe.apply(_update_dec, axis=1)
dataframe["REC"] = dataframe.apply(_update_rec, axis=1)
dataframe["REASON_PLACE_CHANGE"] = dataframe.apply(_update_reason_place_change, axis=1)
dataframe["Episode_source"] = dataframe.apply(_update_episode_source, axis=1)

return dataframe


def add_stage2_rule_identifier_columns(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for adding the columns that flag episodes which overlap or underlap
    their neighbouring episodes.

    Not implemented yet: prints a TODO marker and hands the input back untouched.

    :param dataframe: Dataframe with SSDA903 Episodes data
    :return: The same dataframe that was passed in
    """
    progress_message = "add_stage2_rule_identifier_columns...TODO"
    print(progress_message)
    return dataframe


def identify_stage2_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for adding the column that names the stage 2 rule for each row:
    RULE_4: Overlaps with next episode
    RULE_5: End reason is "X1" - episode continues - but there is gap before next episode

    Not implemented yet: prints a TODO marker and hands the input back untouched.

    :param dataframe: Dataframe with SSDA903 Episodes data
    :return: The same dataframe that was passed in
    """
    progress_message = "identify_stage2_rule_to_apply...TODO"
    print(progress_message)
    return dataframe


def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for applying the stage 2 rules:
    RULE_4: Overlaps with next episode
    RULE_5: End reason is "X1" - episode continues - but there is gap before next episode

    Not implemented yet: prints a TODO marker and hands the input back untouched.

    :param dataframe: Dataframe with SSDA903 Episodes data
    :return: The same dataframe that was passed in
    """
    progress_message = "apply_stage2_rules...TODO"
    print(progress_message)
    return dataframe
27 changes: 19 additions & 8 deletions liiatools/datasets/s903/s903_main_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def episodes_fix(input, output):

# Process stage 1 rule fixes for Episodes table
if table_name == "Episodes":
# Add columns to dataframe to identify which rules should be applied
# Add columns to dataframe to identify which rules should be applied at stage 1
s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True)
s903_df_stage1 = episodes_process.create_previous_and_next_episode(s903_df, episodes_process.__COLUMNS)
s903_df_stage1 = episodes_process.format_datetime(s903_df_stage1, episodes_process.__DATES)
Expand All @@ -233,15 +233,26 @@ def episodes_fix(input, output):

# Apply the stage 1 rules
s903_df_stage1_applied = episodes_process.apply_stage1_rules(s903_df_stage1)


# Add columns to dataframe to identify which rules should be applied at stage 2 TODO
s903_df_stage2 = s903_df_stage1_applied[episodes_process.__COLUMNS_TO_KEEP]
s903_df_stage2 = episodes_process.create_previous_and_next_episode(s903_df_stage2, episodes_process.__COLUMNS)
s903_df_stage2 = episodes_process.format_datetime(s903_df_stage2, episodes_process.__DATES)
s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns(s903_df_stage2)
s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2)

# Apply the stage 2 rules TODO
s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2)

# Following code used to test outputs during development
print("Dataframe with rules identified:")
print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]])
print("Dataframe with stage 1 rules applied (Incomplete - more rules to apply):")
print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "Has_open_episode_error", "Rule_to_apply"]])

s903_df_stage1_applied = s903_df_stage1_applied.sort_values(["CHILD", "DECOM"], ignore_index=True)
s903_df_stage1_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv",
print(s903_df_stage1[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Has_open_episode_error", "Rule_to_apply"]])
print("Dataframe with stage 1 rules applied:")
print(s903_df_stage1_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source", "Has_open_episode_error", "Rule_to_apply"]])
print("Dataframe with stage 2 rules applied...to be developed:")
print(s903_df_stage2_applied[["CHILD", "YEAR", "DECOM", "DEC", "RNE", "REC", "REASON_PLACE_CHANGE", "Episode_source"]])
s903_df_stage2_applied = s903_df_stage2_applied.sort_values(["CHILD", "DECOM"], ignore_index=True)
s903_df_stage2_applied.to_csv(r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv",
index=False)

# Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule)
Expand Down

0 comments on commit 892f6aa

Please sign in to comment.