diff --git a/src/scripts/consumables_analyses/descriptive_analysis.py b/src/scripts/consumables_analyses/descriptive_analysis.py new file mode 100644 index 0000000000..61780bf128 --- /dev/null +++ b/src/scripts/consumables_analyses/descriptive_analysis.py @@ -0,0 +1,446 @@ +''' +Produce plots and estimates for the manuscript "Estimating the health gains and value for money of reducing drug stock-outs in Malawi: an individual-based modelling study" +''' + +import datetime +import os +import textwrap +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from typing import Literal, Sequence, Optional, Union, List + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, +) + +# Define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# Print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# Create folders to store results +resourcefilepath = Path("./resources") +consumable_resourcefilepath = resourcefilepath / "healthsystem/consumables" +simulationfilepath = Path('./outputs/sakshi.mohan@york.ac.uk') +outputfilepath = Path('./outputs/consumables_impact_analysis') +if not os.path.exists(outputfilepath): + os.makedirs(outputfilepath) + +# Utility functions +# Assign item names to item codes +def assign_consumable_names_to_item_codes(df): + # Create dictionary mapping item_codes to consumables names + consumables_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']] + consumables_df = consumables_df[consumables_df['Item_Code'].notna()] + consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items'])) + + # Add consumable_name to df + df = df.copy() + df['item_name'] = df['item_code'].map(consumables_dict) + + return df + +# Prepare availability data +def prepare_availability_dataset_for_plots( + _df: pd.DataFrame, + scenario_list: Optional[list[int]] = None, + scenario_names_dict: Optional[dict[str, str]] = None, + consumable_resourcefilepath: Path = None, + resourcefilepath: Path = None +) -> pd.DataFrame: + """ + Prepares a consumable availability dataset by merging facility and item category data, + renaming columns for scenarios, and cleaning category names for plotting. 
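+ + Example (illustrative sketch; assumes `_df` carries the scenario columns named in `scenario_names_dict`):: + + df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_availability_small.csv") + df = prepare_availability_dataset_for_plots( + _df=df, + scenario_names_dict={'available_prop': 'Actual'}, + consumable_resourcefilepath=consumable_resourcefilepath, + resourcefilepath=resourcefilepath, + )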
+ """ + if scenario_list is None: + scenario_list = [] + if scenario_names_dict is None: + scenario_names_dict = {} + + # Load item category mapping + program_item_mapping = pd.read_csv( + consumable_resourcefilepath / 'ResourceFile_Consumables_Item_Designations.csv', + usecols=['Item_Code', 'item_category'] + ) + program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'}) + program_item_mapping = program_item_mapping[program_item_mapping['item_category'].notna()] + + # Load facility list + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + + # Merge facility and program info + _df = _df.merge( + mfl[['District', 'Facility_Level', 'Facility_ID']], + on='Facility_ID', how='left' + ) + _df = _df.merge(program_item_mapping, on='item_code', how='left') + + # Rename scenario columns + _df = _df.rename(columns=scenario_names_dict) + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + _df['item_category'] = _df['item_category'].map(clean_category_names) + + return _df + +# Wrap Labels +def wrap_labels(labels, width=15): + """Wrap each label to the given character width.""" + return [textwrap.fill(str(lab), width) if lab is not None else "" for lab in labels] + +# Generate heatmaps of average availability +def generate_heatmap( + df: pd.DataFrame, + include_levels: Optional[List[str]] = None, + value_col: str = "Actual", + row: str = "item_category", + col: str = "Facility_Level", + row_order: Optional[Sequence[str]] = None, + col_order: Optional[Sequence[str]] = None, + figurespath: Optional[Path] = None, + filename: str = "heatmap_consumable_availability.png", + figsize: tuple[int, int] = (10, 8), + cmap: str = "RdYlGn", + annot: bool = True, + fmt: Optional[str] = None, # None -> auto choose + font_scale: float = 0.9, + cbar_label: str = "Proportion of days on which consumable is available", + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + as_pct: bool = True, # format annotations as percentages + round_decimals: int = 2, + # option to plot scenarios on the x_axis + scenario_axis: bool = False, # if True, columns become scenarios + scenario_cols: Optional[Sequence[str]] = None, +): + """ + Build a heatmap either by a single column (e.g., Facility_Level) or across multiple + scenario columns placed on the x-axis. 
+ """ + if include_levels is not None: + df = df[df.Facility_Level.isin(include_levels)] + if scenario_axis: + aggregated = ( + df.groupby([row], dropna=True)[scenario_cols] + .mean() + .reset_index() + ) + # Add perfect scenario + aggregated['Perfect'] = 1 # Add a column representing the perfect scenario + heatmap_df = aggregated.set_index('item_category') + else: + # Standard mode: columns = `col`, values = mean(value_col) + aggregated = ( + df.groupby([row, col], dropna=True)[value_col] + .mean() + .reset_index() + ) + heatmap_df = aggregated.pivot(index=row, columns=col, values=value_col) + + # Optional ordering + if row_order is not None: + heatmap_df = heatmap_df.reindex(row_order) + if col_order is not None: + heatmap_df = heatmap_df.reindex(columns=col_order) + + # 2) Totals (means across the raw data, not the pivot means) + if scenario_axis: + # Means by row across all programs for each scenario + row_means = heatmap_df.mean(axis=0) # average per scenario across programs + avg_row = row_means.copy() + heatmap_df.loc["Average"] = avg_row + else: + # Compute from raw df to avoid double-averaging + col_means = df.groupby(row, dropna=False)[value_col].mean() + row_means = df.groupby(col, dropna=False)[value_col].mean() + overall_mean = df[value_col].mean() + + heatmap_df["Average"] = col_means.reindex(heatmap_df.index) + avg_row = row_means.reindex(heatmap_df.columns).copy() + avg_row.loc["Average"] = overall_mean + heatmap_df.loc["Average"] = avg_row + + # 3) Annotation formatting + if as_pct: + # If values are 0–1 proportions, annotate as percentages + display_df = (heatmap_df * 100).round(round_decimals) + if fmt is None: + fmt = ".0f" if round_decimals == 0 else f".{round_decimals}f" + annot_kws = {"fmt": fmt} + # Build string labels with % sign + annot_data = display_df.astype(float) + else: + display_df = heatmap_df.round(round_decimals) + if fmt is None: + fmt = f".{round_decimals}f" + annot_kws = {"fmt": fmt} + annot_data = display_df.astype(float) + + # 4) Plot + sns.set(font_scale=font_scale) + fig, ax = plt.subplots(figsize=figsize) + hm = sns.heatmap( + annot_data, + annot=annot, + cmap=cmap, + cbar_kws={"label": cbar_label}, + ax=ax, + fmt=".2f" + ) + + # If percentage labels requested, overwrite text with % suffix + if annot and as_pct: + for t in ax.texts: + t.set_text(f"{t.get_text()}%") + + # 5) Labels & ticks + xlab = (xlabel or ("Scenario" if scenario_axis else col.replace("_", " ").title())) + ylab = (ylabel or row.replace("_", " ").title()) + ax.set_xlabel(xlab) + ax.set_ylabel(ylab) + ax.set_xticklabels(ax.get_xticklabels(), rotation=90) + ax.set_yticklabels(ax.get_yticklabels(), rotation=0) + + # 6) Save (optional) + if figurespath is not None: + figurespath = Path(figurespath) + figurespath.mkdir(parents=True, exist_ok=True) + outpath = figurespath / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + + plt.show() + plt.close(fig) + return fig, ax, heatmap_df + +# Generate LaTex-compatible detailed table of availability +def generate_detail_availability_table_by_scenario( + df: pd.DataFrame, + groupby_var: str, + scenario_cols: Sequence[str], + include_levels: Union[str, Sequence[str]], + longtable: bool = False, + outputpath: Path = None, + decimals: int = 2, + caption: Optional[str] = None, + label_prefix: str = "tab:availability_by_", + col_width_groupbyvar: str = "4cm", + col_width_scenario: str = "1.8cm", +) -> str: + """ + Create a LaTeX table of average availability (as percentages) for each consumable across scenarios, + filtered to selected 
facility levels. + + Returns the LaTeX string and writes it to outputpath / f"availability_by_{groupby_var}_{levels}.tex" + """ + + # --- Setup --- + if outputpath is None: + outputpath = outputfilepath / "appendix" # fall back to the default appendix folder + outputpath.mkdir(parents=True, exist_ok=True) + + # Accept str OR list for include_levels + if isinstance(include_levels, str): + include_levels = [include_levels] + + table_df = df.copy() + + # Filter by facility level if the column is present + if "Facility_Level" in table_df.columns: + table_df = table_df[table_df["Facility_Level"].isin(include_levels)] + + # Aggregate means per item + grouped = ( + table_df + .groupby([groupby_var], dropna=False)[list(scenario_cols)] + .mean() + .reset_index() + ) + + # Rename first column to "Consumable" + grouped.rename(columns={groupby_var: "Consumable"}, inplace=True) + + # Escape LaTeX in Consumable names + def _latex_escape(s): + if pd.isna(s): + return "" + s = str(s) + # Swap backslashes for a sentinel first so they are not re-escaped below, + # then restore them as \textbackslash{} at the end + s = s.replace("\\", "<BACKSLASH>") + s = s.replace("&", r"\&").replace("%", r"\%").replace("_", r"\_") + s = s.replace("#", r"\#").replace("{", r"\{").replace("}", r"\}") + s = s.replace("$", r"\$").replace("^", r"\^{}").replace("~", r"\~{}") + s = s.replace("<BACKSLASH>", r"\textbackslash{}") + return s + + grouped["Consumable"] = grouped["Consumable"].map(_latex_escape) + + # Convert proportions -> percentage strings with escaped % + def pct_format(x: float) -> str: + if pd.isna(x): + return "" + return f"{x * 100:.{decimals}f}\\%" + + for c in scenario_cols: + grouped[c] = grouped[c].map(pct_format) + + # Build column format dynamically + # First col wider for names, then one col per scenario + column_format = ( + f"|R{{{col_width_groupbyvar}}}|" + + "|".join([f"R{{{col_width_scenario}}}"] * len(scenario_cols)) + + "|" + ) + + # Caption/label + if caption is None: + caption = "Summarized availability by consumable" + levels = "_".join(include_levels) # e.g. '1a' rather than the list repr "['1a']" + label = f"{label_prefix}{groupby_var}_{levels}" + + # Export to LaTeX (escape=False since we already escaped) + latex_table = grouped.to_latex( + longtable=longtable, + column_format=column_format, + caption=caption, + label=label, + position="h", + index=False, + escape=False, + header=True, + ) + + # Add \hline after each row + latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n") + + # Save + outpath = outputpath / f"availability_by_{groupby_var}_{levels}.tex" + with open(outpath, "w", encoding="utf-8") as f: + f.write(latex_table) + + return latex_table + + +# Import and clean data files +#********************************** +# Import TLO model availability data +tlo_availability_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_availability_small.csv") +scenario_names_dict = { + 'available_prop': 'Actual', + 'available_prop_scenario1': 'Non-therapeutic consumables', + 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-managed', + 'available_prop_scenario4': 'Level 1b', + 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile facility', + 'available_prop_scenario7': '90th percentile facility', + 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)', + 'available_prop_scenario10': 'HIV supply chain', + 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain (Avg by Level)', + 'available_prop_scenario13': 'HIV moved to Govt supply chain (Avg by Facility_ID)', + 'available_prop_scenario14': 'HIV moved to 
Govt supply chain (Avg by Facility_ID times 1.25)', + 'available_prop_scenario15': 'HIV moved to Govt supply chain (Avg by Facility_ID times 0.75)' + } + +tlo_availability_df = prepare_availability_dataset_for_plots( + _df=tlo_availability_df, + scenario_list=[1, 2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 15], + scenario_names_dict=scenario_names_dict, + consumable_resourcefilepath=consumable_resourcefilepath, + resourcefilepath=resourcefilepath +) + +# Generate figures for manuscript +#********************************** +# Figure 1: Average probability of consumable availability in public and CHAM health facilities in Malawi +_ = generate_heatmap( + df=tlo_availability_df, + value_col="Actual", + row="item_category", + col="Facility_Level", + figurespath=outputfilepath / 'manuscript', + filename="heatmap_program_and_level_actual.png", + figsize=(10, 8), + cmap="RdYlGn", + round_decimals=4, + cbar_label="Proportion of days on which consumable is available", + xlabel="Facility Level", + ylabel="Program", +) + +# Figure 3: Comparison of consumable availability across modelled scenarios +scenario_cols = ['Actual', 'Non-therapeutic consumables', 'Vital medicines', 'Pharmacist-managed', '75th percentile facility', '90th percentile facility', 'Best facility'] +for level in ['1a', '1b']: + _ = generate_heatmap( + df=tlo_availability_df, + include_levels=[level], + value_col="Actual", # ignored when scenario_axis=True + row="item_category", + col="Facility_Level", + figurespath=outputfilepath / 'manuscript', + filename=f"scenarios_heatmap_{level}.png", + figsize=(10, 8), + cmap="RdYlGn", + round_decimals=4, + cbar_label="Proportion of days on which consumable is available", + xlabel="Scenario", # columns are scenarios in this mode + ylabel="Program", + scenario_axis=True, # if True, columns become scenarios + scenario_cols=scenario_cols, + ) + + +# Figure A.1: Trend in average consumable availability by facility level + +# Figure A.2: Comparison of consumable availability as per Open Logistics Management Information System (OpenLMIS), 2018 and Harmonised +# Health Facility Assessment, 2018-19 + +# Assign consumable names to item codes for the appendix tables +tlo_availability_df = assign_consumable_names_to_item_codes(tlo_availability_df) + +# Table B.1: Average probability of availability for each consumable under all scenarios (Level 1a) +availability_by_item_1a = generate_detail_availability_table_by_scenario( + df=tlo_availability_df, + groupby_var="item_name", + scenario_cols=scenario_cols, + include_levels="1a", + longtable=True, + outputpath=outputfilepath / "appendix", + decimals=2, + caption="Average probability of availability for each consumable under all scenarios (Level 1a)", +) + +# Table B.2: Average probability of availability for each consumable under all scenarios (Level 1b) +availability_by_item_1b = generate_detail_availability_table_by_scenario( + df=tlo_availability_df, + groupby_var="item_name", + scenario_cols=scenario_cols, + include_levels="1b", + longtable=True, + outputpath=outputfilepath / "appendix", + decimals=2, + caption="Average probability of availability for each consumable under all scenarios (Level 1b)", +) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py new file mode 100644 index 0000000000..cc37c26629 --- /dev/null +++ 
b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -0,0 +1,1461 @@ +import datetime +from pathlib import Path +import pickle +import calendar + +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd + +from typing import Literal, Optional, Dict, Tuple +import requests + +from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous, PULP_CBC_CMD +from math import ceil + +# define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# define folder pathways +outputfilepath = Path("./outputs/consumables_impact_analysis") +resourcefilepath = Path("./resources") +path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" +# Set local shared drive source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' +) + +# 1. Import and clean data files +#********************************** +# Import Cleaned OpenLMIS data from 2018 +lmis = (pd.read_csv(outputfilepath / "ResourceFile_Consumables_availability_and_usage.csv") + [['district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'available_prop', + 'closing_bal', 'amc', 'dispensed', 'received']]) + +# Drop duplicated facility, item, month combinations +print(lmis.shape, "rows before collapsing duplicates") +key_cols = ["district", "item_code", "fac_name", "month"] # keys that define a unique record + +# helper to keep one facility level per group (mode → most common; fallback to first non-null) +def _mode_or_first(s: pd.Series): + s = s.dropna() + if s.empty: + return np.nan + m = s.mode() + return m.iloc[0] if not m.empty else s.iloc[0] + +lmis = ( + lmis + .groupby(key_cols, as_index=False) + .agg( + closing_bal=("closing_bal", "sum"), + dispensed=("dispensed", "sum"), + received=("received", "sum"), + amc=("amc", "sum"), + available_prop=("available_prop", "mean"), + fac_type_tlo=("fac_type_tlo", _mode_or_first), # optional; remove if not needed + ) +) + +print(lmis.shape, "rows after collapsing duplicates") + +# Import data on facility location +location = (pd.read_excel(path_to_share / "07 - Data/Facility_GPS_Coordinates/gis_data_for_openlmis/LMISFacilityLocations_raw.xlsx") + [['LMIS Facility List', 'LATITUDE', 'LONGITUDE']]) +# Find duplicates in facilty names in the location dataset +duplicates = location[location['LMIS Facility List'].duplicated(keep=False)] +location = location.drop(duplicates[duplicates['LATITUDE'].isna()].index).reset_index(drop=True) # Drop those duplicates where location is missing +# Import ownership data +ownership = (pd.read_csv(path_to_share / "07 - Data/Consumables data/OpenLMIS/lmis_facility_ownership.csv"))[['fac_name', 'fac_owner']] +ownership = ownership.drop_duplicates(subset=['fac_name']) + +# Merge OpenLMIS and location and ownership data +lmis = lmis.merge(location, left_on='fac_name', right_on = 'LMIS Facility List', how = 'left', validate='m:1') +lmis = lmis.merge(ownership, on='fac_name', how = 'left', validate='m:1') +lmis.rename(columns = {'LATITUDE':'lat', 'LONGITUDE':'long', 'fac_type_tlo': 'Facility_Level'}, inplace = True) + +# Cleaning to match date to the same format as consumable availability RF in TLO model +month_map = { + "January": 1, "February": 2, 
"March": 3, "April": 4, + "May": 5, "June": 6, "July": 7, "August": 8, + "September": 9, "October": 10, "November": 11, "December": 12 +} +lmis["month"] = lmis["month"].map(month_map) +lmis["Facility_Level"] = lmis["Facility_Level"].str.replace("Facility_level_", "", regex=False) + +# Clean data types before analysis +# 1) Normalize fac_name +lmis["fac_name"] = ( + lmis["fac_name"] + .astype("string") # Pandas string dtype (not just object) + .str.normalize("NFKC") # unify unicode forms + .str.strip() # trim leading/trailing spaces + .str.replace(r"\s+", "_", regex=True) # collapse internal whitespace +) + +# 2) Normalize other key columns used in grouping/joins +lmis["item_code"] = lmis["item_code"].astype("string").str.strip() +lmis["district"] = lmis["district"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True) +lmis["Facility_Level"] = lmis["Facility_Level"].astype("string").str.strip() + +# 3) Ensure numeric types (quietly coerce bad strings to NaN) +lmis["amc"] = pd.to_numeric(lmis["amc"], errors="coerce") +lmis["closing_bal"] = pd.to_numeric(lmis["closing_bal"], errors="coerce") + +# Keep only those facilities whose location is available +old_facility_count = lmis.fac_name.nunique() +lmis = lmis[lmis.lat.notna()] +new_facility_count = lmis.fac_name.nunique() +print(f"{old_facility_count - new_facility_count} facilities out of {old_facility_count} in the lmis data dropped due to " + f"missing location information") + +# Explore missingness +def compute_opening_balance(df: pd.DataFrame) -> pd.Series: + """ + Opening balance from same-month records: + OB = closing_bal - received + dispensed. + This is equivalent to OB_(m) = CB_(m-1) + """ + return df["closing_bal"] - df["received"] + df["dispensed"] + +# Drop the rows which seem suspicious +# - 1. Rows with AMC and OB = 0 (these would be excluded from the redistribution algorithm anyway) +# - 2. 
Rows where the (fac_name, item_code) pair only reported 0 AMC throughout the year +lmis.reset_index(inplace=True, drop=True) +lmis['opening_bal'] = compute_opening_balance(lmis) +cb0 = (lmis.closing_bal == 0)|(lmis.closing_bal.isna()) +ob0 = (lmis.opening_bal == 0)|(lmis.opening_bal.isna()) +amc0 = (lmis.amc == 0)|(lmis.amc.isna()) +d0 = (lmis.dispensed == 0)|(lmis.dispensed.isna()) +r0 = (lmis.received == 0)|(lmis.received.isna()) +prob0 = (lmis.available_prop == 0)|(lmis.available_prop.isna()) +non_zero_amc_groups = lmis.loc[~amc0, ["fac_name", "item_code"]].drop_duplicates() +non0_amc = lmis[['fac_name', 'item_code']].apply(tuple, axis=1).isin(non_zero_amc_groups.apply(tuple, axis=1)) +non_zero_ob_groups = lmis.loc[~ob0, ["fac_name", "item_code"]].drop_duplicates() +non0_ob = lmis[['fac_name', 'item_code']].apply(tuple, axis=1).isin(non_zero_ob_groups.apply(tuple, axis=1)) + +print(f"{(amc0 & non0_amc).sum()} rows ({(amc0 & non0_amc).sum()/len(lmis) * 100:.2f}%) where AMC is zero even though the same facility recorded a non-zero AMC for a specific item " + f"for a different month in the year") +print(f"Among these, {(amc0 & cb0 & non0_amc).sum()} rows ({(amc0 & cb0 & non0_amc).sum()/len(lmis) * 100:.2f}% of " + f"total) also recorded CB as zero even though CB was non-zero for a different month") +print(f"Among these, {(amc0 & cb0 & non0_amc & ~prob0).sum()} rows " + f"({(amc0 & cb0 & non0_amc & ~prob0).sum()/len(lmis) * 100:.2f}% of " + f"total) have been recorded as having a non-zero probability of drug availability based on data from other months.") + +old_row_count = len(lmis) +drop_mask = (amc0 & ob0) | (~non0_amc) +lmis = lmis[~drop_mask] +new_row_count = len(lmis) + +print(f"{old_row_count - new_row_count} ({(old_row_count - new_row_count)/old_row_count * 100:.2f}%) " + f"rows out of {old_row_count} in the lmis data dropped due to seemingly incorrect entries") +lmis.reset_index(inplace=True, drop=True) + +# TODO assume something else about the location of these facilities with missing location - e.g. centroid of district? +# only 16 facilities have missing information + +# ----------------------------------- +# 1) Data exploration +# ----------------------------------- +def generate_stock_adequacy_heatmap( + df: pd.DataFrame, + figures_path: Path = Path("figures"), + filename: str = "heatmap_adequacy_opening_vs_3xamc.png", + y_var: str = "district", # the variable on the y-axis of the heatmap + value_var: str = "item_code", # the count variable on the basis of which the values are calculated + value_label: str = "", # label describing the values in the heatmap + include_missing_as_fail: bool = False, # if True, items with NaN opening/amc count as NOT adequate + amc_threshold: float = 3.0, + compare: str = "ge", # "ge" for >= threshold*AMC, "le" for <= threshold*AMC + decimals: int = 0, + cmap: str = "RdYlGn", + figsize=None, + xtick_rotation: int = 45, + ytick_rotation: int = 0, + annotation: bool = True, +): + """ + Heatmap values: for each (month, district), the % of item_code groups where + sum(opening_balance over Facility_ID) >= (or <=) amc_threshold * sum(amc over Facility_ID). + """ + + df = df.copy() + df["opening_bal"] = compute_opening_balance(df).replace([np.inf, -np.inf], np.nan).fillna(0.0) + + # --- 1. 
Ensure month is int and build label --- + df["month"] = pd.to_numeric(df["month"], errors="coerce").astype("Int64") + df = df.dropna(subset=["month"]) + df["month"] = df["month"].astype(int) + + df["_month_label"] = df["month"].map(lambda m: calendar.month_abbr[m]) + + # ---- 2) Aggregate to (month, district, item_code) over facilities ---- + agg = ( + df.groupby(["month", "_month_label", y_var, value_var], dropna=False) + .agg(opening_bal=("opening_bal", "sum"), + amc=("amc", "sum")) + .reset_index() + ) + + # ---- 3) Adequacy indicator per (month, district, item_code) ---- + if include_missing_as_fail: + # NaNs treated as fail -> NaN-safe compare: set False when either value is missing + ok = agg[["opening_bal", "amc"]].notna().all(axis=1) + left = agg["opening_bal"] + right = amc_threshold * agg["amc"] + if compare == "le": + cond = (left <= right) & ok + else: # default to ">=" + cond = (left >= right) & ok + else: + valid = agg.dropna(subset=["opening_bal", "amc"]) + cond = pd.Series(False, index=agg.index) + left = valid["opening_bal"] + right = amc_threshold * valid["amc"] + if compare == "le": + cond.loc[valid.index] = left <= right + else: + cond.loc[valid.index] = left >= right + + agg["condition_met"] = cond.astype(int) + + # ---- 4) % meeting condition per (month, district) across item_code ---- + if include_missing_as_fail: + denom = agg.groupby(["month", "_month_label", y_var])[value_var].nunique() + numer = agg.groupby(["month", "_month_label", y_var])["condition_met"].sum() + else: + valid_mask = agg[["opening_bal", "amc"]].notna().all(axis=1) + denom = agg[valid_mask].groupby(["month", "_month_label", y_var])[value_var].nunique() + numer = agg[valid_mask].groupby(["month", "_month_label", y_var])["condition_met"].sum() + + pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan).reset_index(name="pct_meeting") + + # ---- 5) Pivot: districts (rows) x months (columns) ---- + # Sort months chronologically and use _month_label as the displayed column name + month_order = ( + pct[["month", "_month_label"]] + .drop_duplicates() + .sort_values("month") + ["_month_label"] + .tolist() + ) + heatmap_df = pct.pivot(index=y_var, columns="_month_label", values="pct_meeting") + heatmap_df = heatmap_df.reindex(columns=month_order) + + # --- Add average row and column --- + # Column average (mean of each month) + heatmap_df.loc["Average"] = heatmap_df.mean(axis=0) + # Row average (mean of each y_var) + heatmap_df["Average"] = heatmap_df.mean(axis=1) + # The bottom-right cell now holds the overall mean of the monthly means + + # Optional rounding for nicer colorbar ticks (doesn't affect color) + if decimals is not None: + heatmap_df = heatmap_df.round(decimals) + + # --- Dynamic figure size --- + if figsize is None: + n_rows = len(heatmap_df) + n_cols = len(heatmap_df.columns) + height = max(6, n_rows * 0.2) # taller if many rows + width = max(8, n_cols * 0.6) + figsize = (width, height) + + # ---- 6) Plot heatmap ---- + sns.set(font_scale=1.0) + fig, ax = plt.subplots(figsize=figsize) + + sns.heatmap( + heatmap_df, + cmap=cmap, + cbar_kws={"label": value_label}, + ax=ax, + annot=annotation, annot_kws={"size": 10}, + vmin=0, vmax=100) + + ax.set_xlabel("Month") + ax.set_ylabel(y_var) + ax.set_xticklabels(ax.get_xticklabels(), rotation=xtick_rotation) + ax.set_yticklabels(ax.get_yticklabels(), rotation=ytick_rotation) + + # Keep colorbar ticks plain (no scientific notation) + 
try: + cbar_ax = ax.figure.axes[-1] + cbar_ax.ticklabel_format(style="plain") + except Exception: + pass + + # ---- 7) Save & return ---- + figures_path.mkdir(parents=True, exist_ok=True) + outpath = figures_path / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + plt.show() + plt.close(fig) + + return fig, ax, heatmap_df + +# ----------------------------------- +# 2) Estimate travel time matrix +# ----------------------------------- + +def _chunk_indices(n: int, chunk: int): + """Yield (start, end) index pairs for chunking 0..n-1.""" + for s in range(0, n, chunk): + e = min(n, s + chunk) + yield s, e + +def build_travel_time_matrix( + fac_df: pd.DataFrame, + *, + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: Literal["car", "bicycle"] = "car", + backend: Literal["ors", "osrm"] = "ors", + # ORS options + ors_api_key: Optional[str] = None, + ors_base_url: str = "https://api.openrouteservice.org/v2/matrix", + # OSRM options (self-hosted or public; note: public often has only 'car') + osrm_base_url: str = "https://router.project-osrm.org", + osrm_profile_map: dict = None, + # matrix request chunking (keeps requests within API limits) + max_chunk: int = 40, + timeout: int = 60, +) -> pd.DataFrame: + """ + Build an NxN *road* travel-time matrix (minutes) for facilities, by CAR or BICYCLE + + backends: + - 'ors' -> uses OpenRouteService Matrix API (profiles: driving-car, cycling-regular). + Requires ors_api_key. Has rate/size limits; we auto-chunk. + - 'osrm' -> uses OSRM 'table' service (profiles typically 'car' or 'bike'). + For bicycle, you'll likely need a self-hosted OSRM with the bicycle profile. + + Parameters + ---------- + mode : 'car' | 'bicycle' + Travel mode on roads. + max_chunk : int + Max #origins (and #destinations) per sub-matrix request to keep within API limits. + + Returns + ------- + pd.DataFrame + Square DataFrame (minutes), index/columns = facility names. 
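+ + Example (sketch; "MY_ORS_KEY" is a placeholder API key and the facility ids shown are hypothetical):: + + T_car = build_travel_time_matrix( + fac_df=lmis[["fac_name", "lat", "long"]].drop_duplicates(), + mode="car", backend="ors", ors_api_key="MY_ORS_KEY", + ) + T_car.loc["Facility_A", "Facility_B"] # minutes by road; np.inf if unroutable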
+ """ + facs = fac_df[[id_col, lat_col, lon_col]].dropna().drop_duplicates().reset_index(drop=True) + ids = facs[id_col].tolist() + lats = facs[lat_col].to_numpy() + lons = facs[lon_col].to_numpy() + n = len(ids) + + T = pd.DataFrame(np.full((n, n), np.nan, dtype=float), index=ids, columns=ids) + np.fill_diagonal(T.values, 0.0) + + if n == 0: + return T + + if backend == "ors": + if ors_api_key is None: + raise ValueError("OpenRouteService requires ors_api_key.") + profile = "driving-car" if mode == "car" else "cycling-regular" + + # ORS expects [lon, lat] + coords = [[float(lons[i]), float(lats[i])] for i in range(n)] + + headers = {"Authorization": ors_api_key, "Content-Type": "application/json"} + + # Chunk both sources and destinations to stay under limits + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = list(range(si, sj)) + destinations = list(range(di, dj)) + + body = { + "locations": coords, + "sources": sources, + "destinations": destinations, + "metrics": ["duration"], + } + url = f"{ors_base_url}/{profile}" + r = requests.post(url, json=body, headers=headers, timeout=timeout) + r.raise_for_status() + data = r.json() + + # durations in seconds; fill submatrix + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"ORS returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + elif backend == "osrm": + # Map desired mode to OSRM profile names + if osrm_profile_map is None: + osrm_profile_map = {"car": "car", "bicycle": "bike"} + profile = osrm_profile_map.get(mode) + if profile is None: + raise ValueError(f"No OSRM profile mapped for mode='{mode}'.") + + # NOTE: The public OSRM demo often supports *car only*. + # For bicycle, run your own OSRM with the bicycle profile. + # We use the OSRM 'table' service; POST with 'sources' and 'destinations' indices. + + coords = ";".join([f"{lons[i]},{lats[i]}" for i in range(n)]) # lon,lat + + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = ";".join(map(str, range(si, sj))) + destinations = ";".join(map(str, range(di, dj))) + + url = ( + f"{osrm_base_url}/table/v1/{profile}/{coords}" + f"?sources={sources}&destinations={destinations}&annotations=duration" + ) + r = requests.get(url, timeout=timeout) + r.raise_for_status() + data = r.json() + + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"OSRM returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + else: + raise ValueError("backend must be 'ors' or 'osrm'.") + + # Safety: replace any residual NaNs (unroutable pairs) with inf or a large number + T = T.fillna(np.inf) + return T + +# Because ORS and ORSM can only handle a limited number of elements at a time, it's better to run this by district +# each of which already has under 50 facilities +def build_time_matrices_by_district( + df: pd.DataFrame, + *, + district_col: str = "district", + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: str = "car", + backend: str = "osrm", + osrm_base_url: str = "https://router.project-osrm.org", + ors_api_key: str | None = None, + max_chunk: int = 50, # safe for both OSRM/ORS +) -> dict[str, pd.DataFrame]: + """ + Returns a dict: {district -> square minutes matrix DataFrame}, computed within each district only. 
+ """ + matrices = {} + # unique facilities per district (drop duplicates in case of repeated rows) + fac_cols = [district_col, id_col, lat_col, lon_col] + fac_coords = df[fac_cols].dropna().drop_duplicates() + + for d, facs_d in fac_coords.groupby(district_col): + # Skip tiny groups + if len(facs_d) < 2: + continue + + T = build_travel_time_matrix( + fac_df=facs_d[[id_col, lat_col, lon_col]], + id_col=id_col, lat_col=lat_col, lon_col=lon_col, + mode=mode, backend=backend, + osrm_base_url=osrm_base_url, + ors_api_key=ors_api_key, + max_chunk=max_chunk, + ) + matrices[d] = T + + return matrices + +# ----------------------------------------------- +# 3) Data prep for redistribution linear program +# ----------------------------------------------- +def presumed_availability(ob, amc, eps=1e-9) -> float: + """ + Presumed likelihood of cons availability = p = min(1, OB/AMC) at month start (no additional receipts considered + at this point in time). + """ + return float(min(1.0, max(0.0, (ob / max(eps, amc))))) + + +def build_edges_within_radius( + time_matrix: pd.DataFrame, + max_minutes: float +) -> Dict[str, set]: + """ + For each facility g, return set of receivers f such that T[g,f] <= max_minutes, f != g. + """ + edges = {} + for g in time_matrix.index: + feasible = set(time_matrix.columns[(time_matrix.loc[g] <= max_minutes) & (time_matrix.columns != g)]) + edges[g] = feasible + return edges + +def build_edges_within_radius_flat(T_by_dist: dict, max_minutes: float) -> dict[str, set[str]]: + """ + Takes the district-wise dictionary of time travel matrices and converts it into a flat dictionary of facilities + and their edge neighbours depending on the maximum allowable travel distance. + T_by_dist: {district -> square DataFrame of minutes (index/cols = facility IDs)} + Returns: {facility_id -> set(of facility_ids)} for all districts combined. + """ + edges: dict[str, set[str]] = {} + for _, T in T_by_dist.items(): + for g in T.index: + row = T.loc[g].to_numpy() + feasible_mask = (row <= max_minutes) & np.isfinite(row) + # Exclude self + feasible = [f for f in T.columns[feasible_mask] if f != g] + if g not in edges: + edges[g] = set() + edges[g].update(feasible) + return edges + +# a = build_edges_within_radius(T_car, max_minutes = 18) + +# Defining clusters of health facilities within district +# This function helps find the facilities which would be appropriate cluster centers +def _farthest_first_seeds(T: pd.DataFrame, k: int, big: float = 1e9) -> list: + """ + Pick k seed medoids via farthest-first traversal on a travel-time matrix. + Treat inf/NaN distances as 'big' so disconnected components get separate seeds. 
+ """ + n = T.shape[0] + facs = T.index.tolist() + D = T.to_numpy().astype(float) + D[~np.isfinite(D)] = big + + # Start at the row with largest average distance (covers sparse areas first) + start = int(np.nanargmax(np.nanmean(D, axis=1))) # the remotest facility + seeds_idx = [start] + + # Iteratively add the point with max distance to its nearest seed + for _ in range(1, k): + # min distance to any existing seed for every point + min_to_seed = np.min(D[:, seeds_idx], axis=1) # this has a dimension of [facs, number of seeds] + next_idx = int(np.argmax(min_to_seed)) # for each facility find the distance to its nearest seed + # and the facility farthest from the nearest seed becomes the next seed + if next_idx in seeds_idx: + # Fallback: pick any non-seed with highest min_to_seed + candidates = [i for i in range(n) if i not in seeds_idx] + if not candidates: + break + next_idx = int(candidates[np.argmax(min_to_seed[candidates])]) + seeds_idx.append(next_idx) + + return [facs[i] for i in seeds_idx] # list of length k representing the clustering points + +# Assign each facility to its nearest seed subject to a hard cluster capacity (≤ X members) +def _assign_to_cluster_with_fixed_capacity(T: pd.DataFrame, seeds: list, capacity: int, big: float = 1e9) -> Dict[str, int]: + """ + Greedy assignment of facilities to nearest seed that still has capacity (based on maximum cluster size). + Returns: mapping facility -> seed_index (position in seeds list). + """ + facs = T.index.tolist() + D = T.loc[facs, seeds].to_numpy().astype(float) # Distance of all facilities from the k seeds + D[~np.isfinite(D)] = big + + # each facility: nearest distance to any seed (for stable ordering) + nearest = D.min(axis=1) # find the shortest distance to a cluster for each facility + order = np.argsort(nearest) # sort all facilities in ascending order of their distance from the nearest facility + + cap_left = {j: capacity for j in range(len(seeds))} # the capacity left for each seed + assign = {} + + for idx in order: + f = facs[idx] + # try seeds in ascending distance + seq = np.argsort(D[idx, :]) # the sequence of seeds most suitable for idx + placed = False + for j in seq: + if cap_left[j] > 0: + assign[f] = j + cap_left[j] -= 1 + placed = True + break + if not placed: + # total capacity >= n, so this should be rare; if it happens, put in least-loaded seed + j = min(cap_left, key=lambda jj: cap_left[jj]) + assign[f] = j + cap_left[j] -= 1 + + return assign + +def capacity_clusters_for_district( + T_d: pd.DataFrame, cluster_size: int = 5, big: float = 1e9, refine_swaps: int = 0 +) -> Dict[str, str]: + """ + Build ~equal-size clusters (size<=cluster_size) from a district's travel-time matrix via + capacity-constrained k-medoids (farthest-first seeds + greedy capacity assignment). + + Returns: {facility_id -> cluster_id} (cluster ids like 'C00','C01',...) + """ + facs = T_d.index.tolist() + n = len(facs) + if n == 0: + return {} + if n <= cluster_size: + return {f: "C00" for f in facs} + + k = ceil(n / cluster_size) + seeds = _farthest_first_seeds(T_d, k=k, big=big) + assign_seed_idx = _assign_to_cluster_with_fixed_capacity(T_d, seeds=seeds, capacity=cluster_size, big=big) + + # Optional tiny refinement: (off by default) + # You could add 1–2 passes of medoid swap within clusters to reduce intra-cluster travel. 
+ + # Build cluster ids in seed order + seed_to_cid = {j: f"C{j:02d}" for j in range(len(seeds))} + return {f: seed_to_cid[assign_seed_idx[f]] for f in facs} + +def build_capacity_clusters_all( + T_by_dist: Dict[str, pd.DataFrame], cluster_size: int = 5 +) -> pd.Series: + """ + Apply capacity clustering to all districts. + Args: + T_by_dist: {'DistrictName': time_matrix (minutes, square DF with facility ids)} + cluster_size: desired max cluster size (e.g., 5) + + Returns: + pd.Series mapping facility_id -> cluster_id names scoped by district, e.g. 'Nkhotakota#C03' + """ + mappings = [] + for d, T_d in T_by_dist.items(): + if T_d is None or T_d.empty: + continue + local_map = capacity_clusters_for_district(T_d, cluster_size=cluster_size) + if not local_map: + continue + s = pd.Series(local_map, name="cluster_id") + s = s.map(lambda cid: f"{d}#{cid}") # scope cluster name by district + mappings.append(s) + if not mappings: + return pd.Series(dtype=object) + return pd.concat(mappings) + +# ----------------------------------------------- +# 3) LP/MILP Redistribution +# ----------------------------------------------- +def redistribute_radius_lp( + df: pd.DataFrame, + time_matrix: pd.DataFrame, + radius_minutes: float, + tau_keep: float = 2.0, # Keeper minimum (2 --> Donors want to keep at least 2 X AMC) + tau_tar: float = 1.0, # Receiver target (1 --> Receivers want OB = AMC) + alpha: float = 0.7, # Donation cap (between 0 and 1) + # alpha can be set to 1 if donors only want to keep a min of tau_keep X AMC + K_in: int = 2, # max donors per receiver (per item) + K_out: int = 2, # max receivers per donor (per item) + # TODO the above limits should be across items not per item + Qmin_days: float = 7.0, # minimum lot size in "days of demand" at receiver + # i.e. at least Qmin_days worth of stock should be supplied for the transfer to be justifiable + id_cols=("district","month","item_code"), + facility_col="fac_name", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), + return_edge_log: bool = True, +) -> pd.DataFrame: + """ + Scenario 1 or 2: local swaps within a time radius. + Presumed p' = min(1, OB'/AMC). + Objective: maximize total filled deficits (sum of transfers), which is a proxy for maximizing total availability. + + Returns a copy of df with a new column 'available_prop_redis' for redistributed probability per row. 
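+ + Example (sketch; T_by_dist as returned by build_time_matrices_by_district; edge_log lists each executed transfer with units moved and travel minutes):: + + redis_df, edge_log = redistribute_radius_lp( + df=lmis, time_matrix=T_by_dist, radius_minutes=30, + )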
+ """ + closing_bal, received, dispensed = close_cols + out = df.copy() + out["OB"] = compute_opening_balance(out).replace([np.inf, -np.inf], np.nan).fillna(0.0) + out["OB_prime"] = out["OB"] # container for updated OB by (district, month, item, facility) + + # iterate per (district, month, item) + group_cols = list(id_cols) + EPS = 1e-9 + EPS_AMC = 1e-6 + + edge_logs = [] # will log dicts with district, month, item_code, donor, receiver, units_moved, travel_minutes + + for (d, m, i), g in out.groupby(group_cols, sort=False): + # facilities in this slice + g = g.copy() + + # ---------- (1) Select the correct time matrix for this district / slice ---------- + if isinstance(time_matrix, dict): + T_d = time_matrix.get(d) + if T_d is None or T_d.empty: + continue + else: + T_d = time_matrix + + # facilities present in this slice AND in the matrix index/cols + slice_facs = g[facility_col].dropna().unique().tolist() + present = [f for f in slice_facs if (f in T_d.index and f in T_d.columns)] + if len(present) < 2: + continue + + # subset and sanitize travel-times (NaN -> inf) + # This is to make sure that the redistribution LP is run only for those pairs for which a travel time could be calculated + T_sub = T_d.loc[present, present].replace(np.nan, np.inf) + + # ---------- (2) Clean inputs (AMC, OB) ---------- + # build per-fac arrays + AMC = (g.set_index(facility_col)[amc_col] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + AMC[AMC <= 0.0] = EPS_AMC # avoid zero AMC + + OB0 = (g.set_index(facility_col)["OB"] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + + # keep only present facilities + AMC = AMC.reindex(present).fillna(EPS_AMC) + OB0 = OB0.reindex(present).fillna(0.0) + + # ---------- (3) Surplus/deficit with finite guards ---------- + # donor surplus cap and receiver deficit + surplus_cap = alpha * np.maximum(0.0, OB0.values - tau_keep * AMC.values) + deficit = np.maximum(0.0, tau_tar * AMC.values - OB0.values) + surplus_cap = np.where(np.isfinite(surplus_cap), surplus_cap, 0.0) + deficit = np.where(np.isfinite(deficit), deficit, 0.0) + + donors_mask = surplus_cap > EPS + recv_mask = deficit > EPS + if not donors_mask.any() or not recv_mask.any(): + continue + + donors = list(AMC.index[donors_mask]) + receivers = list(AMC.index[recv_mask]) + + smax_map = {fac: float(x) for fac, x in zip(AMC.index, surplus_cap)} # creates a dict with key = facility and value = donatable surplus + # this is the upper bound for how much a facility can donate + deficit_map = {fac: float(x) for fac, x in zip(AMC.index, deficit)} # creates a dict with key = facility and value = deficit + # this is the upper bound for how much a facility can receive + + # ---------- (4) Build feasible edges within radius (finite & <= radius) ---------- + feasible = {} + for g_fac in present: + row = T_sub.loc[g_fac].to_numpy() + mask = (row <= radius_minutes) & np.isfinite(row) + feas = set(T_sub.columns[mask]) - {g_fac} + feasible[g_fac] = feas + + # ---------- (5) Edge set with tight M_edge and min-lot per receiver ---------- + Qmin_units = {f: float(max(Qmin_days * AMC[f], 0.0)) for f in present} + M_edge = {} + for g_fac in donors: + smax = smax_map.get(g_fac, 0.0) + if smax <= EPS: + continue + for f_fac in feasible.get(g_fac, set()): + if f_fac not in receivers: + continue + h = deficit_map.get(f_fac, 0.0) + M = min(h, smax) + if np.isfinite(M) and M > EPS: + M_edge[(g_fac, f_fac)] = M + + if not M_edge: + continue # nothing feasible to move + + # Build MILP + prob = 
LpProblem(f"Redistribution_{d}_{m}_{i}", LpMaximize) + + # Variables + t = {} # transfer amounts per (g,f): continuous >=0 + y = {} # edge-activation binaries + for (g_fac, f_fac), M in M_edge.items(): + t[(g_fac, f_fac)] = LpVariable(f"t_{g_fac}->{f_fac}", lowBound=0, upBound=M, cat=LpContinuous) + y[(g_fac, f_fac)] = LpVariable(f"y_{g_fac}->{f_fac}", lowBound=0, upBound=1, cat=LpBinary) + + # objective: maximize total shipped (proxy for availability gain) + prob += lpSum(t.values()) + + # donor outflow caps + for g_fac in donors: + prob += lpSum(t[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in t) <= smax_map.get(g_fac, 0.0) + + # receiver inflow caps + for f_fac in receivers: + prob += lpSum(t[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in t) <= deficit_map.get(f_fac, 0.0) + + # link t and y; enforce min-lot clipped by edge capacity + for (g_fac, f_fac), M in M_edge.items(): + prob += t[(g_fac, f_fac)] <= M * y[(g_fac, f_fac)] + qmin = min(Qmin_units.get(f_fac, 0.0), M) + # if qmin is basically zero, don't force a min lot + if qmin > EPS: + prob += t[(g_fac, f_fac)] >= qmin * y[(g_fac, f_fac)] + + # cardinality limits + for f_fac in receivers: + prob += lpSum(y[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in y) <= K_in + for g_fac in donors: + prob += lpSum(y[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in y) <= K_out + + # solve + prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) + + if LpStatus[prob.status] not in ("Optimal", "Optimal Infeasible", "Not Solved"): + continue + + # apply transfers & LOG + delta = {fac: 0.0 for fac in present} + for (g_fac, f_fac), var in t.items(): + moved = float(value(var) or 0.0) + if moved > EPS_AMC: + delta[g_fac] -= moved + delta[f_fac] += moved + if return_edge_log: + edge_logs.append({ + "district": d, "month": m, "item_code": i, + "donor_fac": g_fac, "receiver_fac": f_fac, + "units_moved": moved, + "travel_minutes": float(T_sub.loc[g_fac, f_fac]) if np.isfinite( + T_sub.loc[g_fac, f_fac]) else np.nan + }) + + # write OB' + sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i)) + out.loc[sel, "OB_prime"] = out.loc[sel].apply( + lambda r: (r["OB"] + delta.get(r[facility_col], 0.0)), axis=1 + ) + + print(d, m, i, + "donors:", len(donors), + "receivers:", len(receivers), + "edges:", len(M_edge)) + + # ---------- (8) Mechanistic availability after redistribution ---------- + out["available_prop_redis"] = np.minimum( + 1.0, + np.maximum(0.0, out["OB_prime"] / np.maximum(EPS_AMC, out[amc_col].astype(float).values)) + ) + + # build log DataFrames + edge_log_df = pd.DataFrame(edge_logs, + columns=["district", "month", "item_code", "donor_fac", "receiver_fac", "units_moved", + "travel_minutes"]) if return_edge_log else None + + return out, edge_log_df + +def redistribute_pooling_lp( + df: pd.DataFrame, + tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days) - minimum transfer required + tau_max: float = 3.0, # upper ceiling (storage/policy max) + tau_donor_keep: float = 3.0, # minimum the donor keeps before donating + id_cols=("district","month","item_code"), + facility_col="fac_name", + level_col="Facility_Level", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), + keep_baseline_for_amc0: bool = True, # leave baseline availability where AMC≈0 + amc_eps: float = 1e-6, # threshold to treat AMC as "zero" + return_move_log: bool = True, # return a detailed df showing net movement of consumables after redistribution + pooling_level: str = 
"district", # "district" or "cluster" + cluster_map: pd.Series | None = None, # required if pooling_level=="cluster"; this specifes which cluster each facility belongs to + enforce_no_harm: bool = True, # if True, enforce p_f >= baseline at each facility (hard constraint) + floor_to_baseline: bool = True # if True, report availability floored to baseline (no decrease in outputs) +) -> pd.DataFrame: + """ + Scenario 3: district-level pooling/push . + Maximizes total availability with: + - NaN/inf guards on AMC/OB + - duplicate facility IDs collapsed within group + - floors scaled if total stock < sum floors + - optional 'excess' sink if total stock > sum ceilings + - availability computed safely; AMC≈0 rows keep baseline (optional) + + Pooling redistribution that can operate at the district level (default) + or within fixed-size clusters inside districts. + + If pooling_level == "cluster", you must pass cluster_map: Series mapping facility_id -> cluster_id + (cluster ids should already be district-scoped, e.g., "Dedza#C01"). + + Returns: + - out: original df plus columns: + OB, OB_prime, available_prop_redis, received_from_pool + where received_from_pool = OB_prime - OB (pos=received, neg=donated) + - (optional) move_log: per (district, month, item, facility) net movement summary + """ + if pooling_level not in ("district", "cluster"): + raise ValueError("pooling_level must be 'district' or 'cluster'.") + + closing_bal, received, dispensed = close_cols + out = df.copy() + + # Safe opening balance + out["OB"] = compute_opening_balance(out).replace([np.inf, -np.inf], np.nan).fillna(0.0) + + # Default (will overwrite per group) + out["OB_prime"] = out["OB"] + + # Attach cluster if needed + if pooling_level == "cluster": + if cluster_map is None: + raise ValueError("cluster_map is required when pooling_level='cluster'.") + # cluster_map: index = facility_id (facility_col), value = cluster_id (already district-scoped) + out = out.merge( + cluster_map.rename("cluster_id"), + how="left", + left_on=facility_col, + right_index=True, + ) + if out["cluster_id"].isna().any(): + # facilities missing a cluster—assign singleton clusters to keep them + out["cluster_id"] = out["cluster_id"].fillna( + out["district"].astype(str) + "#CXX_" + out[facility_col].astype(str)) + + group_cols = list(id_cols) + node_label = "district" + if pooling_level == "cluster": + group_cols = ["cluster_id", "month", "item_code"] + node_label = "cluster_id" + move_rows = [] # optional movement log + # TODO could remove the movement log + + skipped_nodes = [] # collect nodes that did NOT solve optimally + for keys, g in out.groupby(group_cols, sort=False): + g = g.copy() + # Resolve node ID for logging and selection masks + if pooling_level == "district": + node_val, m, i = g["district"].iloc[0], keys[1], keys[2] + else: + node_val, m, i = keys + + # Build per-facility Series (unique index) for AMC, OB, Level, Baseline p + AMC = (g.set_index(facility_col)[amc_col] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + OB0 = (g.set_index(facility_col)["OB"] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + LVL = (g.set_index(facility_col)[level_col].astype(str) + .replace({np.nan: ""})) + Pbase = (g.set_index(facility_col)["available_prop"] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + + # collapse duplicates if any + if AMC.index.duplicated().any(): + AMC = AMC[~AMC.index.duplicated(keep="first")] + if LVL.index.duplicated().any(): + LVL = LVL[~LVL.index.duplicated(keep="first")] 
+ if Pbase.index.duplicated().any(): + Pbase = Pbase.groupby(level=0).mean() # average baseline if duplicates + if OB0.index.duplicated().any(): + OB0 = OB0.groupby(level=0).sum() + + total_stock = float(OB0.sum()) + if total_stock <= 1e-9: + continue + + # Participants (positive demand) + mask_pos = AMC >= amc_eps + facs_pos = AMC.index[mask_pos].tolist() + if not facs_pos: + # nothing to reallocate to; they will be donors only (handled by OB' defaults) + continue + + AMC_pos = AMC.loc[facs_pos] + OB0_pos = OB0.loc[facs_pos] + LVL_pos = LVL.reindex(facs_pos) + Pbase_pos = Pbase.reindex(facs_pos).fillna(0.0) + + # policy floors/ceilings (raw) + tau_min_floor = (tau_min * AMC_pos).astype(float) + tau_max_ceiling = (tau_max * AMC_pos).astype(float) + + # ---- Donor protection: x_f >= pLB_f = min(OB, tau_donor_keep*AMC) + pLB = np.minimum(OB0_pos, tau_donor_keep * AMC_pos) # the lower bound for a donor facility is OB if that is less than + #tau_donor_keep X AMC or tau_donor_keep X AMC is that is lower than OB (i.e. only OB - 3 X AMC can be donated, if tau_donor_keep = 3) + + # ---- Receiver eligibility: only 1a/1b may increase above OB + eligible_mask = LVL_pos.isin(["1a", "1b"]) + # For eligible: UB = tau_max*AMC + # For ineligible: UB = min(OB, tau_max*AMC) (can donate, cannot receive above OB) + UB = tau_max_ceiling.copy() + UB.loc[~eligible_mask] = np.minimum(OB0_pos.loc[~eligible_mask], tau_max_ceiling.loc[~eligible_mask]) + + # ---- Lower bound assembly BEFORE scaling tau_min + LB0 = np.maximum(pLB, tau_min_floor) # enforces receiver's lower bound + + # ---- Optional "no-harm": p_f >= baseline -> x_f >= AMC_f * p_base_f + if enforce_no_harm: + no_harm_lb = (AMC_pos * Pbase_pos).astype(float) + LB0 = np.maximum(LB0, no_harm_lb) + + # ---- Feasibility: scale ONLY the tau_min component if needed + sum_LB0 = float(LB0.sum()) + if total_stock + 1e-9 < sum_LB0: + # We want LB = max(pLB, tau_min_scaled, [no_harm_lb]) + # Scale only the portion of tau_min_floor that lies above pLB (and above no_harm if on). 
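+ # Numeric illustration (hypothetical): if total_stock = 100 and base_guard sums to 80, + # while the tau_min component above base_guard sums to 40, only 20 units remain, so + # that component is scaled by 20/40 = 0.5 before the lower bounds are re-assembled.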
+ base_guard = pLB.copy() + if enforce_no_harm: + base_guard = np.maximum(base_guard, (AMC_pos * Pbase_pos)) + + # Decompose tau_min contribution above base_guard + tau_min_above = np.maximum(0.0, tau_min_floor - np.minimum(tau_min_floor, base_guard)) + need = float(tau_min_above.sum()) + # Available stock for that component: + budget = total_stock - float(base_guard.sum()) + scale = 0.0 if need <= 1e-12 else max(0.0, min(1.0, budget / max(1e-9, need))) + + tau_min_scaled = np.minimum(base_guard, tau_min_floor) + tau_min_above * scale + LB = np.maximum(base_guard, tau_min_scaled) + else: + LB = LB0 + + # ---- Excess sink if ceilings bind + sum_UB = float(UB.sum()) + allow_excess_sink = total_stock > sum_UB + 1e-9 + + # ---------- LP ---------- + prob = LpProblem(f"Pooling_{node_val}_{m}_{i}", LpMaximize) + x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos} + excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None + + # Objective: maximize total availability + prob += lpSum(p.values()) + + # Conservation + if excess is None: + prob += lpSum(x.values()) == total_stock + else: + prob += lpSum(x.values()) + excess == total_stock + + # Bounds + linearization + for f in facs_pos: + prob += x[f] >= float(LB.loc[f]) # donor protection + tau_min (scaled) + (optional) no-harm + prob += x[f] <= float(UB.loc[f]) # eligibility-aware ceiling + prob += float(AMC_pos.loc[f]) * p[f] <= x[f] # p <= x/AMC + + # Solve + prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) + if LpStatus[prob.status] != "Optimal": + skipped_nodes.append((node_val, m, i)) + #print("NO Optimal solution found", node_val, m, i) + continue + #else: + #print("Optimal solution found", node_val, m, i) + + # Apply solution to OB' + x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} + + # Selection mask for writing back + if pooling_level == "district": + sel = (out["district"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + else: + sel = (out["cluster_id"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + + # Facilities with AMC>=eps get x_f + mask_rows_pos = sel & out[facility_col].isin(facs_pos) + out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values + + # Facilities with AMC OB' = 0 + # (this matches the "donate to pool" assumption) + mask_rows_zero = sel & ~out[facility_col].isin(facs_pos) + out.loc[mask_rows_zero, "OB_prime"] = 0.0 + + if return_move_log: + for f in AMC.index: # include amc≈0 facilities (x_f=0) + x_f = x_sol.get(f, 0.0) if f in facs_pos else 0.0 + net = x_f - float(OB0.get(f, 0.0)) + move_rows.append({ + node_label: node_val, + "month": m, + "item_code": i, + "facility": f, + "received_from_pool": net, + "x_allocated": x_f, + "OB0_agg": float(OB0.get(f, 0.0)), + "eligible_receiver": bool(LVL.get(f, "") in {"1a", "1b"}), + }) + + # ---------- Availability after redistribution ---------- + amc_safe = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0) + pos_mask_all = amc_safe >= amc_eps + + p_mech = np.zeros(len(out), dtype=float) # mechanistically calculated p based on balances and AMC + # (different from the base which was calculated from actual stkout_days recorded_ + denom = np.maximum(amc_eps, amc_safe[pos_mask_all].to_numpy()) + p_mech[pos_mask_all.values] = np.minimum( + 1.0, np.maximum(0.0, out.loc[pos_mask_all, "OB_prime"].to_numpy() / denom) + ) + + if keep_baseline_for_amc0: + base = 
+    if keep_baseline_for_amc0:
+        base = (out.loc[~pos_mask_all, "available_prop"]
+                .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)).to_numpy()
+        p_mech[~pos_mask_all.values] = base
+    else:
+        p_mech[~pos_mask_all.values] = (out.loc[~pos_mask_all, "OB_prime"].to_numpy() > 0.0).astype(float)
+
+    # For levels 1a and 1b, force availability to be the max of old (actual) versus new
+    # (mechanistically calculated)
+    out["available_prop_redis_raw"] = p_mech
+    out["available_prop_redis"] = (
+        np.maximum(p_mech, out["available_prop"].astype(float).values)
+        if floor_to_baseline else p_mech
+    )
+    # For levels other than 1a and 1b, force availability to equal the old value. This should
+    # ideally happen automatically; however, there are facilities at levels 2-4 where some
+    # stock-out was experienced even though OB > AMC. We retain the original probability in
+    # these cases because the overall analysis is restricted to levels 1a and 1b.
+    mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"])
+    out.loc[mask_non_elig, "available_prop_redis"] = out.loc[mask_non_elig, "available_prop"]
+
+    # Per-row movement
+    out["received_from_pool"] = out["OB_prime"] - out["OB"]
+
+    # Check that the rules were correctly applied
+    # --- Build masks for skipping ---
+    # Exclude any node/month/item combinations that did not solve optimally or that were
+    # skipped because AMC == 0
+    # 1. Nodes that failed to solve optimally
+    if skipped_nodes:
+        skipped_df = pd.DataFrame(skipped_nodes, columns=[node_label, "month", "item_code"])
+
+        # Merge to flag rows belonging to skipped groups
+        out = out.merge(
+            skipped_df.assign(skip_flag=True),
+            on=[node_label, "month", "item_code"],
+            how="left",
+        )
+        mask_skip_solution = out["skip_flag"].fillna(False)
+    else:
+        out["skip_flag"] = False  # ensure the column exists even when no groups were skipped
+        mask_skip_solution = pd.Series(False, index=out.index)
+    # 2. Facilities with AMC effectively zero
+    mask_skip_amc = out["amc"].astype(float) <= 1e-9
+
+    # Combined skip mask
+    mask_skip = mask_skip_solution | mask_skip_amc
+    mask_solved = ~mask_skip
+    print(f"Skipping {mask_skip.sum()} rows due to non-optimal LPs or AMC≈0")
+
+    # Lower bound check: no facility should end below min(OB, tau_donor_keep*AMC)
+    tol = 1e-6  # tolerance
+    viol_lb = (
+        (out.loc[mask_solved, "OB_prime"] <
+         (np.minimum(out.loc[mask_solved, "OB"], tau_donor_keep * out.loc[mask_solved, "amc"]) - tol))
+    )
+
+    # Upper bound check: no facility should end above its ceiling
+    elig = out.loc[mask_solved, "Facility_Level"].isin(["1a", "1b"]).values
+    ub = np.where(
+        elig,
+        tau_max * out.loc[mask_solved, "amc"],
+        np.minimum(out.loc[mask_solved, "OB"], tau_max * out.loc[mask_solved, "amc"])
+    )
+    viol_ub = out.loc[mask_solved, "OB_prime"].values > (ub + tol)
+
+    # assert not viol_lb.any(), "lower bound violated"
+    # assert not viol_ub.any(), "upper bound violated"
+    temp = out[mask_solved]
+    if viol_lb.any():
+        print("For the following rows (facility, item and month combinations), it is unclear why "
+              "OB_prime < min(OB, tau_donor_keep * AMC), which violates a constraint in the LP")
+        # TODO see if this problem needs to be resolved
+        print(temp[viol_lb][['Facility_Level', 'amc', 'OB', 'OB_prime']])
+        temp[viol_lb][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_lb.csv')
+    if viol_ub.any():
+        print("For the following rows (facility, item and month combinations), it is unclear why "
+              "OB_prime exceeds its upper bound, which violates a constraint in the LP")
+        # TODO see if this problem needs to be resolved
+        print(temp[viol_ub][['Facility_Level', 'amc', 'OB', 'OB_prime']])
+        temp[viol_ub][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_ub.csv')
+
+    if return_move_log:
+        move_log = pd.DataFrame(move_rows)
+        return out, move_log
+
+    return out
+
+# pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True)
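+# A minimal, self-contained sketch of the LP solved per (node, month, item) above, on
+# illustrative numbers (three facilities; bounds chosen by hand, not from the data).
+# Same structure: maximize sum(p_f) subject to stock conservation, lower/upper bounds
+# on x_f, and the linearization AMC_f * p_f <= x_f.
+def _toy_pooling_lp_example():
+    amc = {"A": 10.0, "B": 20.0, "C": 5.0}
+    ob = {"A": 40.0, "B": 5.0, "C": 0.0}               # A is stock-rich, B and C are short
+    total = sum(ob.values())
+    lb = {f: min(ob[f], 3.0 * amc[f]) for f in amc}    # donor protection (tau_donor_keep = 3)
+    ub = {f: 4.0 * amc[f] for f in amc}                # ceiling (tau_max = 4)
+    prob = LpProblem("toy_pooling", LpMaximize)
+    x = {f: LpVariable(f"x_{f}", lowBound=0) for f in amc}
+    p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in amc}
+    prob += lpSum(p.values())                          # objective: total availability
+    prob += lpSum(x.values()) == total                 # conservation
+    for f in amc:
+        prob += x[f] >= lb[f]
+        prob += x[f] <= ub[f]
+        prob += amc[f] * p[f] <= x[f]                  # p <= x/AMC
+    prob.solve(PULP_CBC_CMD(msg=False))
+    return {f: (value(x[f]), value(p[f])) for f in amc}
+# e.g. _toy_pooling_lp_example(): A keeps min(40, 30) = 30 (p = 1.0) and its surplus of 10
+# is split so that C reaches p = 1.0 (x = 5) and B gets the remainder (x = 10, p = 0.5).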
+ """ + if scenario in ("radius_30","radius_60"): + if time_matrix is None: + raise ValueError("time_matrix is required for radius-based scenarios.") + radius = 30 if scenario=="radius_30" else 60 + detailed = redistribute_radius_lp( + df=df, + time_matrix=time_matrix, + radius_minutes=radius, + tau_keep=tau_keep, + tau_tar=tau_tar, + alpha=alpha, + K_in=K_in, + K_out=K_out, + Qmin_days=Qmin_days, + id_cols=id_cols, + facility_col=facility_col, + amc_col=amc_col, + close_cols=close_cols, + ) + elif scenario == "pooling": + detailed = redistribute_pooling_lp( + df=df, + tau_min=tau_min, + tau_max=tau_max, + id_cols=id_cols, + facility_col=facility_col, + amc_col=amc_col, + close_cols=close_cols, + return_move_log=False, + ) + else: + raise ValueError("scenario must be one of 'radius_30', 'radius_60', 'pooling'.") + + if not aggregate_for_tlo: + return detailed, None + + # Aggregate to TLO input grain: mean across facilities (match your current pipeline) + tlo = ( + detailed + .groupby(["item_code", "district", level_col, "month"], as_index=False) + .agg(new_available_prop=("available_prop_redis", "mean")) + .sort_values(["item_code","district",level_col,"month"]) + ) + return tlo + +# IMPLEMENT +# 1) Build a time matrix +fac_coords = lmis[['fac_name', 'district', 'lat','long']] +#T_car = build_time_matrices_by_district( +# fac_coords, +# mode="car", +# backend="osrm", +# osrm_base_url="https://router.project-osrm.org", +# max_chunk=50) + +# Store dictionary in pickle format +#with open(outputfilepath / "T_car2.pkl", "wb") as f: +# pickle.dump(T_car, f) +# -> Commented out because it takes long to run. The result has been stored in pickle format + +# Load pre-generated dictionary +with open(outputfilepath / "T_car2.pkl", "rb") as f: + T_car = pickle.load(f) +# T_car2 was created after cleaning fac names and getting rid of spaces in the text + +#edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) + +# 2) Explore the availability and distances to make decisions about optimisation rules +# Plot stock adequacy by district and month to assess what bounds to set when pooling +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_code', + value_label= f"% of consumables with Opening Balance ≥ 3 × AMC", + amc_threshold = 3, compare = "ge", + filename = "mth_district_stock_adequacy_3amc.png", figsize = (12,10)) +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_code', + value_label= f"% of consumables with Opening Balance ≥ 3 × AMC", + amc_threshold = 2, compare = "ge", + filename = "mth_district_stock_adequacy_2amc.png", figsize = (12,10)) +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_code', + value_label= f"% of consumables with Opening Balance <= 1 × AMC", + amc_threshold = 1, compare = "le", + filename = "mth_district_stock_inadequacy_1amc.png", figsize = (12,10)) +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_code', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance ≥ 3 × AMC", + amc_threshold = 3, compare = "ge", + filename = "mth_item_stock_adequacy_3amc.png") +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_code', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance <= 
1 × AMC", + amc_threshold = 1, compare = "le", + filename = "mth_item_stock_inadequacy_1amc.png") + + +# Browse the number of eligible neighbours depending on allowable travel time +results = [] +for mins in [30, 60, 90, 120]: + edges_flat = build_edges_within_radius_flat(T_car, max_minutes=mins) + neighbors_count = pd.Series({fac: len(neigh) for fac, neigh in edges_flat.items()}) + mean = neighbors_count.mean() + sem = neighbors_count.sem() # standard error of mean + ci95 = 1.96 * sem + results.append({"radius": mins, "mean": mean, "ci95": ci95}) + +results_df = pd.DataFrame(results) + +# Plot +plt.figure(figsize=(6,4)) +plt.bar(results_df["radius"], results_df["mean"], yerr=results_df["ci95"], capsize=5, color="skyblue") +plt.xlabel("Travel time radius (minutes)") +plt.ylabel("Average number of facilities within radius") +plt.title("Average connectivity of facilities with 95% CI") +plt.xticks(results_df["radius"]) +plt.savefig(outputfilepath / "neighbour_count_by_max_travel_time") + +# A manual check shows that for distances greater than 60 minutes ORS underestimates the travel time a little +# TODO consider using google maps API + +#Drop NAs +# TODO find a more generalisable solution for this issue (within the optimisation functions) +#lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] + +# 3) Implement pooled redistribution +# a) Build clusters from per-district travel-time matrices +# T_car_by_dist: {"District A": DF(index=fac_ids, cols=fac_ids), ...} +cluster_size = 3 +cluster_series = build_capacity_clusters_all(T_car, cluster_size=cluster_size) +# cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ... + +# b) Run optimisation at district level +print("Now running pooled redistribution at District level") +pooled_district_df, cluster_district_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0, tau_max=4.0, + tau_donor_keep = 3.0, + pooling_level="district", + cluster_map=None, + return_move_log=True, + enforce_no_harm=False, + floor_to_baseline=True +) +print(pooled_district_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) +pooled_district_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis_raw', 'available_prop_redis', 'received_from_pool', + 'skip_flag']].to_csv(outputfilepath/ 'clustering_district_df.csv', index=False) +tlo_pooled_district = ( + pooled_district_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scen16=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + + +# c) Run optimisation at cluster (size = 3) level +print("Now running pooled redistribution at Cluster (Size = 3) level") +pooled_cluster_df, cluster_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0, tau_max=4.0, + tau_donor_keep = 3.0, + pooling_level="cluster", + cluster_map=cluster_series, + return_move_log=True, + enforce_no_harm=True, + floor_to_baseline=True +) +print(pooled_cluster_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) +pooled_cluster_df.to_csv(outputfilepath/ 'clustering_n3_df.csv', index=False) + +tlo_pooled_cluster = ( + pooled_cluster_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scen17=("available_prop_redis", "mean")) + 
.sort_values(["item_code","district","Facility_Level","month"]) + ) + +tlo_pooled = tlo_pooled_district.merge(tlo_pooled_cluster, on=["item_code", "district", "Facility_Level", "month"], how="left", validate="one_to_one") + +# 3) Implement pairwise redistribution +# 2) Run Scenario 1 (1-hour radius) + +tlo_30 = run_redistribution_scenarios( + lmis, + scenario="radius_30", + time_matrix=T_car, + tau_keep=2.0, tau_tar=1.0, alpha=1, + K_in=2, K_out=2, Qmin_days=7.0, +) + +tlo_60 = run_redistribution_scenarios( + lmis, + scenario="radius_60", + time_matrix=T_car, + tau_keep=2.0, tau_tar=1.0, alpha=1, + K_in=2, K_out=2, Qmin_days=7.0, +) + +tlo_pooling = run_redistribution_scenarios( + lmis, + scenario="pooling", + tau_min=0.75, tau_max=3.0 +) + +tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") +# Drop any scenario data previously included in the resourcefile +tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']] + +# Import item_category +program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] +program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] + +# 1.1.1 Attach district, facility level and item_category to this dataset +#---------------------------------------------------------------- +# Get TLO Facility_ID for each district and facility level +mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") +districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) +fac_levels = {'0', '1a', '1b', '2', '3', '4'} +tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left') + +tlo_availability_df = tlo_availability_df.merge(program_item_mapping, + on = ['item_code'], how='left') +tlo_availability_df = tlo_availability_df[~tlo_availability_df[['District', 'Facility_Level', 'month', 'item_code']].duplicated()] + + +comparison_df = tlo_30.rename(columns ={'new_available_prop': 'available_prop_30mins'}).merge(tlo_availability_df, left_on = ['district', 'Facility_Level', 'month', 'item_code'], + right_on = ['District', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") +comparison_df = comparison_df.merge(tlo_60.rename(columns ={'new_available_prop': 'available_prop_60mins'}), on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") +comparison_df = comparison_df.merge(tlo_pooling.rename(columns ={'new_available_prop': 'available_prop_pooling'}), on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") +print(comparison_df['available_prop'].mean(),comparison_df['available_prop_30mins'].mean(), +comparison_df['available_prop_60mins'].mean(), comparison_df['available_prop_pooling'].mean()) + +comparison_df['available_prop_30mins'] = np.maximum( + comparison_df['available_prop_30mins'], + comparison_df['available_prop'] +) +comparison_df['available_prop_60mins'] = np.maximum( + comparison_df['available_prop_60mins'], + comparison_df['available_prop'] +) +comparison_df['available_prop_pooling'] = np.maximum( + comparison_df['available_prop_pooling'], + comparison_df['available_prop'] +) 
+print(comparison_df['available_prop'].mean(), comparison_df['available_prop_30mins'].mean(),
+      comparison_df['available_prop_60mins'].mean(), comparison_df['available_prop_pooling'].mean())
+
+# TODO keep only government facilities?
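+# A possible approach for the TODO above (a sketch; the ownership column name 'fac_owner'
+# is an assumption and is not defined in this script):
+# lmis_gov = lmis[lmis['fac_owner'].eq('Government')]
+# ... then re-run the redistribution scenarios on lmis_gov only.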