Skip to content

Commit

Permalink
Implemented suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
StephenCarterLIIA authored May 20, 2024
1 parent 7f28a1a commit 89490c2
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 120 deletions.
61 changes: 51 additions & 10 deletions liiatools/datasets/s903/lds_ssda903_episodes_fix/process.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

__COLUMNS = [
"DECOM",
Expand Down Expand Up @@ -77,7 +77,7 @@ def format_datetime(dataframe: pd.DataFrame, date_columns: list) -> pd.DataFrame
Format date columns to datetime type
:param dataframe: Dataframe with SSDA903 Episodes data
:param columns: List of columns containing dates
:param date_columns: List of columns containing dates
:return: Dataframe with date columns showing as datetime data type
"""
dataframe[date_columns] = dataframe[date_columns].apply(
Expand Down Expand Up @@ -201,7 +201,7 @@ def _is_previous_episode_submitted_later(row: pd.Series) -> bool:
)


def _stage1_rule_to_apply(row: pd.Series) -> pd.Series:
def _stage1_rule_to_apply(row: pd.Series) -> str:
"""
Determine which Stage 1 rule should be applied
Expand Down Expand Up @@ -236,7 +236,7 @@ def identify_stage1_rule_to_apply(dataframe: pd.DataFrame) -> pd.DataFrame:
return dataframe


def _update_dec_stage1(row: pd.Series) -> pd.Series:
def _update_dec_stage1(row: pd.Series) -> datetime:
"""
Determine updated DEC value. Defaults to input DEC if no rule to apply
Expand All @@ -255,7 +255,7 @@ def _update_dec_stage1(row: pd.Series) -> pd.Series:
return row["DEC"]


def _update_rec_stage1(row: pd.Series) -> pd.Series:
def _update_rec_stage1(row: pd.Series) -> str:
"""
Determine updated REC value. Defaults to input REC if no rule to apply
Expand All @@ -270,7 +270,7 @@ def _update_rec_stage1(row: pd.Series) -> pd.Series:
return row["REC"]


def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series:
def _update_reason_place_change_stage1(row: pd.Series) -> str:
"""
Determine updated REASON_PLACE_CHANGE value. Defaults to input value if no rule to apply
Expand All @@ -285,7 +285,7 @@ def _update_reason_place_change_stage1(row: pd.Series) -> pd.Series:
return row["REASON_PLACE_CHANGE"]


def _update_episode_source_stage1(row: pd.Series) -> pd.Series:
def _update_episode_source_stage1(row: pd.Series) -> str:
"""
Determine updated Episode_source value. Defaults to input value if no rule to apply
Expand Down Expand Up @@ -338,14 +338,14 @@ def _has_x1_gap_before_next_episode(row: pd.Series) -> bool:
return False


def _stage2_rule_to_apply(row):
def _stage2_rule_to_apply(row: pd.Series) -> str:
if row["Overlaps_next_episode"]:
return "RULE_4" # Overlaps next episode and next episode was submitted later
if row["Has_X1_gap_before_next_episode"]:
return "RULE_5" # Ends before next episode but has reason "X1" - continuous and next ep was submitted later


def _update_dec_stage2(row: pd.Series) -> pd.Series:
def _update_dec_stage2(row: pd.Series) -> datetime:
"""
Determine updated DEC value. Defaults to input DEC if no rule to apply
Expand All @@ -357,7 +357,7 @@ def _update_dec_stage2(row: pd.Series) -> pd.Series:
return row["DEC"]


def _update_episode_source_stage2(row: pd.Series) -> pd.Series:
def _update_episode_source_stage2(row: pd.Series) -> str:
"""
Determine updated Episode_source value. Defaults to input value if no rule to apply
Expand Down Expand Up @@ -412,3 +412,44 @@ def apply_stage2_rules(dataframe: pd.DataFrame) -> pd.DataFrame:
# Apply rules 4, 5
dataframe["DEC"] = dataframe.apply(_update_dec_stage2, axis=1)
return dataframe


def stage_1(s903_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the stage 1 episode fixes over an s903 episodes dataframe

    :param s903_df: Dataframe with SSDA903 Episodes data
    :return: Dataframe with stage 1 rules identified and applied
    """
    # Episodes are ordered per child by start date before neighbours are attached
    episodes = create_previous_and_next_episode(
        s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True), __COLUMNS
    )
    episodes = format_datetime(episodes, __DATES)

    # Remaining steps each take and return the dataframe: add the identifier
    # columns, pick the rule for each row, then apply the stage 1 rules
    pipeline = (
        add_latest_year_and_source_for_la,
        add_stage1_rule_identifier_columns,
        identify_stage1_rule_to_apply,
        apply_stage1_rules,
    )
    for step in pipeline:
        episodes = step(episodes)
    return episodes


def stage_2(s903_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the stage 2 episode fixes over an s903 episodes dataframe

    :param s903_df: Dataframe with SSDA903 Episodes data
    :return: Dataframe with stage 2 rules identified and applied, reduced to
        the columns to keep and sorted by CHILD and DECOM
    """
    # Start from the kept columns only, then rebuild the previous/next episode columns
    episodes = create_previous_and_next_episode(
        s903_df[__COLUMNS_TO_KEEP], __COLUMNS
    )
    episodes = format_datetime(episodes, __DATES)

    # Add the stage 2 identifier columns, pick the rule for each row, then apply the rules
    for step in (
        add_stage2_rule_identifier_columns,
        identify_stage2_rule_to_apply,
        apply_stage2_rules,
    ):
        episodes = step(episodes)

    # Drop the working columns and restore a per-child chronological order
    trimmed = episodes[__COLUMNS_TO_KEEP]
    return trimmed.sort_values(["CHILD", "DECOM"], ignore_index=True)
116 changes: 11 additions & 105 deletions liiatools/datasets/s903/s903_main_functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pathlib import Path
import yaml
import logging
import click_log
from datetime import datetime
import yaml
import click_log

# dependencies for cleanfile()
from liiatools.datasets.s903.lds_ssda903_clean import (
Expand All @@ -25,7 +25,8 @@
from liiatools.datasets.s903.lds_ssda903_sufficiency import process as suff_process

# dependencies for episodes fix()
from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process
# from liiatools.datasets.s903.lds_ssda903_episodes_fix import process as episodes_process
from liiatools.datasets.s903.lds_ssda903_episodes_fix.process import stage_1, stage_2

from liiatools.spec import common as common_asset_dir
from liiatools.datasets.shared_functions import (
Expand Down Expand Up @@ -221,117 +222,22 @@ def episodes_fix(input, output):
column_names = config["column_names"]
table_name = common_process.match_load_file(s903_df, column_names)

# Process stage 1 rule fixes for Episodes table
# Process stage 1 and 2 rule fixes for Episodes table
if table_name == "Episodes":
# Add columns to dataframe to identify which rules should be applied at stage 1
s903_df = s903_df.sort_values(["CHILD", "DECOM"], ignore_index=True)
s903_df_stage1 = episodes_process.create_previous_and_next_episode(
s903_df, episodes_process.__COLUMNS
)
s903_df_stage1 = episodes_process.format_datetime(
s903_df_stage1, episodes_process.__DATES
)
s903_df_stage1 = episodes_process.add_latest_year_and_source_for_la(
s903_df_stage1
)
s903_df_stage1 = episodes_process.add_stage1_rule_identifier_columns(
s903_df_stage1
)
s903_df_stage1 = episodes_process.identify_stage1_rule_to_apply(s903_df_stage1)

# Apply the stage 1 rules
s903_df_stage1_applied = episodes_process.apply_stage1_rules(s903_df_stage1)

# Add columns to dataframe to identify which rules should be applied at stage 2 TODO
s903_df_stage2 = s903_df_stage1_applied[episodes_process.__COLUMNS_TO_KEEP]
s903_df_stage2 = episodes_process.create_previous_and_next_episode(
s903_df_stage2, episodes_process.__COLUMNS
)
s903_df_stage2 = episodes_process.format_datetime(
s903_df_stage2, episodes_process.__DATES
)
s903_df_stage2 = episodes_process.add_stage2_rule_identifier_columns(
s903_df_stage2
)
s903_df_stage2 = episodes_process.identify_stage2_rule_to_apply(s903_df_stage2)

# Apply the stage 2 rules
s903_df_stage2_applied = episodes_process.apply_stage2_rules(s903_df_stage2)

s903_df_final = s903_df_stage2_applied[episodes_process.__COLUMNS_TO_KEEP]
s903_df_final = s903_df_final.sort_values(["CHILD", "DECOM"], ignore_index=True)
s903_df_stage1_applied = stage_1(s903_df)
s903_df_final = stage_2(s903_df_stage1_applied)
output_path = Path(output, "SSDA903_episodes_fixed.csv")
s903_df_final.to_csv(
r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_OUTPUT.csv",
output_path,
index=False,
)

# Following code used to print dataframe outputs during development
print_df = False
if print_df:
print("Dataframe with stage 1 rules identified:")
print(
s903_df_stage1[
[
"CHILD",
"YEAR",
"DECOM",
"DEC",
"RNE",
"REC",
"REASON_PLACE_CHANGE",
"Has_open_episode_error",
"Rule_to_apply",
]
]
)
print("Dataframe with stage 1 rules applied:")
print(
s903_df_stage1_applied[
[
"CHILD",
"YEAR",
"DECOM",
"DEC",
"RNE",
"REC",
"REASON_PLACE_CHANGE",
"Episode_source",
"Has_open_episode_error",
"Rule_to_apply",
]
]
)
print("Dataframe with stage 2 rules applied:")
print(
s903_df_stage2_applied[
[
"CHILD",
"YEAR",
"DECOM",
"DEC",
"RNE",
"REC",
"REASON_PLACE_CHANGE",
"Episode_source",
"DECOM_next",
"YEAR_next",
"Has_next_episode",
"Overlaps_next_episode",
"Has_X1_gap_before_next_episode",
"Rule_to_apply",
]
]
)

print("Final dataframe with all rules applied")
print(s903_df_final)


# Run episodes_fix() with our test file which contains examples of each rule (CHILD id indicates which rule)
episodes_fix(
r"liiatools/datasets/s903/lds_ssda903_episodes_fix/SSDA903_episodes_for_testing_fixes_INPUT.csv",
r"liiatools/datasets/s903/lds_ssda903_episodes_fix",
r"liiatools/datasets/s903/lds_ssda903_episodes_fix/",
)

# poetry run python liiatools/datasets/s903/s903_main_functions.py
# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py"
# python -m black "/workspaces/liia-tools/liiatools/datasets/s903/s903_main_functions.py"
8 changes: 3 additions & 5 deletions tests/s903/test_episodes_fix.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,15 @@
_is_previous_episode_submitted_later,
_stage1_rule_to_apply,
add_stage1_rule_identifier_columns,
identify_stage1_rule_to_apply,
_update_dec_stage1,
_update_rec_stage1,
_update_reason_place_change_stage1,
_update_episode_source_stage1,
apply_stage1_rules,
_overlaps_next_episode,
_has_x1_gap_before_next_episode,
_stage2_rule_to_apply,
_update_dec_stage2,
_update_episode_source_stage2,
add_stage2_rule_identifier_columns,
identify_stage2_rule_to_apply,
apply_stage2_rules,
)


Expand Down Expand Up @@ -122,6 +117,9 @@ def test_add_stage1_rule_identifier_columns():
assert data_with_identifiers_added[
"Has_next_episode_with_RNE_equals_S"
].tolist() == [True, False]
assert data_with_identifiers_added["Next_episode_is_duplicate"].tolist() == [False, False]
assert data_with_identifiers_added["Previous_episode_is_duplicate"].tolist() == [False, False]
assert data_with_identifiers_added["Previous_episode_submitted_later"].tolist() == [False, False]


def test__is_the_same():
Expand Down

0 comments on commit 89490c2

Please sign in to comment.