Commit

add analysis reports
patrick-troy committed Jan 31, 2024
1 parent 290adb9 commit f320775
Showing 23 changed files with 1,004 additions and 280 deletions.
54 changes: 0 additions & 54 deletions liiatools/cin_census_pipeline/_reports_referrals.py

This file was deleted.

Empty file.
21 changes: 14 additions & 7 deletions liiatools/cin_census_pipeline/pipeline.py
@@ -1,11 +1,11 @@
import logging
from fs import open_fs
from fs.base import FS

from liiatools.common import pipeline as pl
from liiatools.common.archive import DataframeArchive
from liiatools.common.constants import ProcessNames, SessionNames
from liiatools.common.data import (
DataContainer,
ErrorContainer,
FileLocator,
PipelineConfig,
@@ -18,8 +18,8 @@
load_schema,
load_schema_path,
)

from liiatools.cin_census_pipeline.stream_pipeline import task_cleanfile
from liiatools.cin_census_pipeline.reports import reports


logger = logging.getLogger()
@@ -58,7 +58,7 @@ def process_file(
schema = load_schema(year=year)
schema_path = load_schema_path(year=year)
metadata = dict(year=year, schema=schema, la_code=la_code)

# Normalise the data and export to the session 'cleaned' folder
try:
cleanfile_result = task_cleanfile(file_locator, schema, schema_path)
@@ -154,9 +154,16 @@ def process_session(source_fs: FS, output_fs: FS, la_code: str):
report_folder = export_folder.makedirs(report, recreate=True)
report_data.data.export(report_folder, "cin_census_", "csv")

+ # Run report analysis
+ analysis_data = report_data.data["CIN"]
+
- process_session(
-     open_fs(r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\CIN\pipeline\input"),
-     open_fs(r"C:\Users\patrick.troy\OneDrive - Social Finance Ltd\Work\LIIA\LIIA tests\CIN\pipeline\output"),
-     la_code="BAR"
+ expanded_assessment_factors = reports.expanded_assessment_factors(analysis_data)
+ referral_outcomes = reports.referral_outcomes(analysis_data)
+ s47_journeys = reports.s47_journeys(analysis_data)
+
+ analysis_data = DataContainer(
+     {"factors": expanded_assessment_factors, "referrals": referral_outcomes, "S47_journeys": s47_journeys}
+ )
+
+ analysis_folder = export_folder.makedirs("REPORTS", recreate=True)
+ analysis_data.export(analysis_folder, "cin_census_", "csv")
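With the hard-coded test invocation removed, a minimal sketch of driving the pipeline from local folders might look like the following; the directory paths are placeholders and "BAR" is only an example LA code.

from fs import open_fs

from liiatools.cin_census_pipeline.pipeline import process_session

# Placeholder locations; point these at the real input and output directories.
source_fs = open_fs("path/to/cin/input")
output_fs = open_fs("path/to/cin/output")

process_session(source_fs, output_fs, la_code="BAR")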
43 changes: 43 additions & 0 deletions liiatools/cin_census_pipeline/reports/__init__.py
@@ -0,0 +1,43 @@
import pandas as pd
import numpy as np


def _time_between_date_series(
later_date: pd.Series,
earlier_date: pd.Series,
years: bool = False,
days: bool = False,
) -> pd.Series:
"""
Returns the time between two date series, in whole years or days.
:param later_date: The later date.
:param earlier_date: The earlier date.
:param years: If True, returns the number of whole years between the two dates. The default is False.
:param days: If True, returns the number of days between the two dates. The default is False.
:returns: The number of years or days between the dates.
"""
time = later_date - earlier_date
time = time.dt.days

if days:
time = time.astype("Int64")
return time

elif years:
time = (time / 365).apply(np.floor)
time = time.astype("Int64")
return time


def _filter_events(data: pd.DataFrame, day_column: str, max_days: int) -> pd.DataFrame:
"""
Filters the data to only include events whose day count falls between zero and the specified maximum.
:param data: The data to filter.
:param day_column: The column containing the number of days between events.
:param max_days: The maximum number of days to include.
:returns: The filtered data.
"""
data = data[((data[day_column] <= max_days) & (data[day_column] >= 0))]
return data
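A minimal usage sketch of these two helpers with made-up dates; the column name days_to_assessment and the 45-day threshold are illustrative, not taken from the pipeline config.

import pandas as pd

from liiatools.cin_census_pipeline.reports import _filter_events, _time_between_date_series

# Illustrative frame; real data comes from the cleaned CIN Census tables.
events = pd.DataFrame(
    {
        "CINreferralDate": pd.to_datetime(["2023-01-01", "2023-02-01"]),
        "AssessmentActualStartDate": pd.to_datetime(["2023-01-10", "2023-06-01"]),
    }
)

# Days from referral to assessment start, as a nullable integer series.
events["days_to_assessment"] = _time_between_date_series(
    events["AssessmentActualStartDate"], events["CINreferralDate"], days=True
)

# Keep only assessments that started within 45 days of the referral.
recent = _filter_events(events, "days_to_assessment", max_days=45)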
74 changes: 74 additions & 0 deletions liiatools/cin_census_pipeline/reports/_reports_referrals.py
@@ -0,0 +1,74 @@
import numpy as np
import pandas as pd

from liiatools.cin_census_pipeline.spec import load_reports
from liiatools.cin_census_pipeline.reports import _time_between_date_series, _filter_events


def referral_outcomes(data: pd.DataFrame) -> pd.DataFrame:
"""
Add referral outcomes to the data based on assessment and S47 dates. These can be:
NFA, S17, S47 or BOTH.
:param data: The data from which to calculate referral outcomes.
:returns: The data with referral outcomes attached.
"""
reports_config = load_reports()

s17_dates = data[data["AssessmentActualStartDate"].notna()][
["LAchildID", "CINreferralDate", "AssessmentActualStartDate"]
].drop_duplicates()

s17_dates["days_to_s17"] = _time_between_date_series(
s17_dates["CINreferralDate"], s17_dates["AssessmentActualStartDate"], days=True
)

# Only assessments within config-specified period following referral are valid
s17_dates = _filter_events(
s17_dates, "days_to_s17", max_days=reports_config["ref_assessment"]
)

s47_dates = data[data["S47ActualStartDate"].notna()][
["LAchildID", "CINreferralDate", "S47ActualStartDate"]
].drop_duplicates()

s47_dates["days_to_s47"] = _time_between_date_series(
s47_dates["CINreferralDate"], s47_dates["S47ActualStartDate"], days=True
)

# Only S47s within config-specified period following referral are valid
s47_dates = _filter_events(
s47_dates, "days_to_s47", max_days=reports_config["ref_assessment"]
)

merged = data[["LAchildID", "CINreferralDate", "PersonBirthDate"]].drop_duplicates()
merged = merged.merge(s17_dates, how="left", on=["LAchildID", "CINreferralDate"])
merged = merged.merge(s47_dates, how="left", on=["LAchildID", "CINreferralDate"])

neither = (
merged["AssessmentActualStartDate"].isna() & merged["S47ActualStartDate"].isna()
)
s17_set = (
merged["AssessmentActualStartDate"].notna()
& merged["S47ActualStartDate"].isna()
)
s47_set = (
merged["AssessmentActualStartDate"].isna()
& merged["S47ActualStartDate"].notna()
)
both_set = (
merged["AssessmentActualStartDate"].notna()
& merged["S47ActualStartDate"].notna()
)

merged["referral_outcome"] = np.select(
[neither, s17_set, s47_set, both_set],
["NFA", "S17", "S47", "BOTH"],
default=None,
)

merged["Age at referral"] = _time_between_date_series(
merged["CINreferralDate"], merged["PersonBirthDate"], years=True
)

return merged
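A hedged sketch of running this report on a small, hand-made frame; the rows below are invented, and load_reports() must be able to find reports.yml for the ref_assessment lookup to succeed.

import pandas as pd

from liiatools.cin_census_pipeline.reports._reports_referrals import referral_outcomes

# Invented rows with the columns referral_outcomes expects.
cin = pd.DataFrame(
    {
        "LAchildID": ["child1", "child2"],
        "CINreferralDate": pd.to_datetime(["2023-01-01", "2023-02-01"]),
        "PersonBirthDate": pd.to_datetime(["2015-06-01", "2012-03-15"]),
        "AssessmentActualStartDate": pd.to_datetime(["2023-01-05", None]),
        "S47ActualStartDate": pd.to_datetime([None, "2023-02-07"]),
    }
)

outcomes = referral_outcomes(cin)
print(outcomes[["LAchildID", "referral_outcome", "Age at referral"]])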
107 changes: 107 additions & 0 deletions liiatools/cin_census_pipeline/reports/_reports_s47_journeys.py
@@ -0,0 +1,107 @@
import pandas as pd
import numpy as np
from datetime import datetime

from liiatools.cin_census_pipeline.spec import load_reports
from liiatools.cin_census_pipeline.reports import (
_time_between_date_series,
)


def s47_journeys(data: pd.DataFrame) -> pd.DataFrame:
"""
Creates an output that can be used to generate a Sankey diagram of outcomes from S47 events.
:param data: The data from which to calculate S47 event outcomes.
:returns: The data with S47 outcomes attached.
"""
reports_config = load_reports()

s47_dates = data[data["S47ActualStartDate"].notna()][
["LAchildID", "CINreferralDate", "S47ActualStartDate"]
].drop_duplicates()

cpp_dates = data[data["CPPstartDate"].notna()][
["LAchildID", "CINreferralDate", "CPPstartDate"]
].drop_duplicates()

merged = data[
[
"LAchildID",
"CINreferralDate",
"PersonBirthDate",
"DateOfInitialCPC",
"Year",
]
].drop_duplicates()

merged = merged.merge(s47_dates, how="left", on=["LAchildID", "CINreferralDate"])
merged = merged.merge(cpp_dates, how="left", on=["LAchildID", "CINreferralDate"])

merged["icpc_to_cpp"] = _time_between_date_series(
merged["CPPstartDate"], merged["DateOfInitialCPC"], days=True
)

merged["s47_to_cpp"] = _time_between_date_series(
merged["CPPstartDate"], merged["S47ActualStartDate"], days=True
)

# Only keep logically consistent events (as defined in config variables)
merged = merged[
(
(merged["icpc_to_cpp"] >= 0)
& (merged["icpc_to_cpp"] <= reports_config["icpc_cpp_days"])
)
| (
(merged["s47_to_cpp"] >= 0)
& (merged["s47_to_cpp"] <= reports_config["s47_cpp_days"])
)
]

# Dates used to define the window for S47 events whose outcome may not yet be known because the CIN Census is too recent
merged["cin_census_close"] = merged["Year"].apply(lambda year: datetime(int(year), 3, 31))

merged["s47_max_date"] = merged["cin_census_close"] - pd.Timedelta(
reports_config["s47_day_limit"]
)
merged["icpc_max_date"] = merged["cin_census_close"] - pd.Timedelta(
reports_config["icpc_day_limit"]
)

merged["Source"] = "S47 strategy discussion"

icpc = merged["DateOfInitialCPC"].notna()

cpp_start = merged["DateOfInitialCPC"].isna() & merged["CPPstartDate"].notna()

# TODO: Check if this (and the default=No ICPC or CPP) ever actually comes up
# (I think they're removed when checking for logical events)
tbd = merged["S47ActualStartDate"] >= merged["s47_max_date"]

merged["Destination"] = np.select(
[icpc, cpp_start, tbd],
["ICPC", "CPP Start", "TBD - S47 too recent"],
default="No ICPC or CPP",
)

# Work on a copy so the assignments below do not trigger pandas SettingWithCopy warnings
icpc_destination = merged[merged["Destination"] == "ICPC"].copy()
icpc_destination["Source"] = "ICPC"

cpp_start_2 = icpc_destination["CPPstartDate"].notna()

tbd_2 = icpc_destination["DateOfInitialCPC"] >= icpc_destination["icpc_max_date"]

icpc_destination["Destination"] = np.select(
[cpp_start_2, tbd_2],
["CPP Start", "TBD - ICPC too recent"],
default="No CPP",
)

s47_journey = pd.concat([merged, icpc_destination])

s47_journey["Age at S47"] = _time_between_date_series(
s47_journey["S47ActualStartDate"], s47_journey["PersonBirthDate"], years=True
)

return s47_journey
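Because the output is shaped as Source/Destination pairs, a plausible follow-up step (not part of this commit) is to aggregate the journeys into link weights for the Sankey diagram; cin_data below stands in for a cleaned CIN Census DataFrame.

from liiatools.cin_census_pipeline.reports._reports_s47_journeys import s47_journeys

journeys = s47_journeys(cin_data)  # cin_data: a cleaned CIN Census DataFrame (placeholder)

# Count transitions between each Source and Destination to use as Sankey link weights.
links = (
    journeys.groupby(["Source", "Destination"])
    .size()
    .reset_index(name="Count")
)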
@@ -1,7 +1,13 @@
from ._reports_assessment_factors import expanded_assessment_factors
from ._reports_referrals import referral_outcomes
from ._reports_s47_journeys import s47_journeys
from liiatools.cin_census_pipeline.reports import _time_between_date_series, _filter_events

__all__ = [
"expanded_assessment_factors",
"referral_outcomes",
"s47_journeys",
]



8 changes: 8 additions & 0 deletions liiatools/cin_census_pipeline/spec/__init__.py
@@ -1,5 +1,6 @@
from functools import lru_cache
from pathlib import Path
import yaml

import xmlschema

@@ -21,5 +22,12 @@ def load_schema(year: int) -> xmlschema.XMLSchema:
return xmlschema.XMLSchema(SCHEMA_DIR / f"CIN_schema_{year:04d}.xsd")


@lru_cache
def load_schema_path(year: int) -> Path:
return Path(SCHEMA_DIR, f"CIN_schema_{year:04d}.xsd")


@lru_cache
def load_reports():
with open(SCHEMA_DIR / "reports.yml", "rt") as FILE:
return yaml.load(FILE, Loader=yaml.FullLoader)
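For reference, load_reports() returns the parsed contents of reports.yml as a dict; the report code above looks up the keys ref_assessment, icpc_cpp_days, s47_cpp_days, s47_day_limit and icpc_day_limit. A short sketch of a call, with invented values shown in the comment:

from liiatools.cin_census_pipeline.spec import load_reports

config = load_reports()
# Illustrative shape only, e.g.:
# {"ref_assessment": 45, "icpc_cpp_days": 15, "s47_cpp_days": 60,
#  "s47_day_limit": "60 days", "icpc_day_limit": "45 days"}
max_assessment_days = config["ref_assessment"]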