From b609fc4af71669529c4582a7710b58d883f603e6 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 13 Aug 2025 19:58:24 +0100 Subject: [PATCH 01/16] Add script to generate descriptive plots for consumables impact manuscript - add heatmaps - add detailed table of average availability by consumable across scenarios --- .../descriptive_analysis.py | 372 ++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 src/scripts/consumables_analyses/descriptive_analysis.py diff --git a/src/scripts/consumables_analyses/descriptive_analysis.py b/src/scripts/consumables_analyses/descriptive_analysis.py new file mode 100644 index 0000000000..bf3528ea87 --- /dev/null +++ b/src/scripts/consumables_analyses/descriptive_analysis.py @@ -0,0 +1,372 @@ +''' +Produce plots and estimates for the manuscript "Estimating the health gains and value for money of reducing drug stock-outs in Malawi: an individual-based modelling study" +''' + +import datetime +import os +import textwrap +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from typing import Literal, Sequence, Optional, Union, List + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, +) + +# Define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# Print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# Create folders to store results +resourcefilepath = Path("./resources") +consumable_resourcefilepath = resourcefilepath / "healthsystem/consumables" +simulationfilepath = Path('./outputs/sakshi.mohan@york.ac.uk') +outputfilepath = Path('./outputs/consumables_impact_analysis') +if not os.path.exists(outputfilepath): + os.makedirs(outputfilepath) + +# Utility functions +# Prepare availability data +def prepare_availability_dataset_for_plots( + _df: pd.DataFrame, + scenario_list: Optional[list[int]] = None, + scenario_names_dict: Optional[dict[str, str]] = None, + consumable_resourcefilepath: Path = None, + resourcefilepath: Path = None +) -> pd.DataFrame: + """ + Prepares a consumable availability dataset by merging facility and item category data, + renaming columns for scenarios, and cleaning category names for plotting. 
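+
+    A sketch of the intended call (it mirrors the invocation further below in
+    this script; the path variables are defined at the top of the file):
+    >>> tlo_availability_df = prepare_availability_dataset_for_plots(
+    ...     _df=tlo_availability_df,
+    ...     scenario_names_dict=scenario_names_dict,
+    ...     consumable_resourcefilepath=consumable_resourcefilepath,
+    ...     resourcefilepath=resourcefilepath,
+    ... )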
+ """ + if scenario_list is None: + scenario_list = [] + if scenario_names_dict is None: + scenario_names_dict = {} + + # Load item category mapping + program_item_mapping = pd.read_csv( + consumable_resourcefilepath / 'ResourceFile_Consumables_Item_Designations.csv', + usecols=['Item_Code', 'item_category'] + ) + program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'}) + program_item_mapping = program_item_mapping[program_item_mapping['item_category'].notna()] + + # Load facility list + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + + # Merge facility and program info + _df = _df.merge( + mfl[['District', 'Facility_Level', 'Facility_ID']], + on='Facility_ID', how='left' + ) + _df = _df.merge(program_item_mapping, on='item_code', how='left') + + # Rename scenario columns + _df = _df.rename(columns=scenario_names_dict) + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + _df['item_category'] = _df['item_category'].map(clean_category_names) + + return _df + +# Wrap Labels +def wrap_labels(labels, width=15): + """Wrap each label to the given character width.""" + return [textwrap.fill(str(lab), width) if lab is not None else "" for lab in labels] + +# Generate heatmaps of average availability +def generate_heatmap( + df: pd.DataFrame, + include_levels: Optional[List[str]] = None, + value_col: str = "Actual", + row: str = "item_category", + col: str = "Facility_Level", + row_order: Optional[Sequence[str]] = None, + col_order: Optional[Sequence[str]] = None, + figurespath: Optional[Path] = None, + filename: str = "heatmap_consumable_availability.png", + figsize: tuple[int, int] = (10, 8), + cmap: str = "RdYlGn", + annot: bool = True, + fmt: Optional[str] = None, # None -> auto choose + font_scale: float = 0.9, + cbar_label: str = "Proportion of days on which consumable is available", + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + as_pct: bool = True, # format annotations as percentages + round_decimals: int = 2, + # option to plot scenarios on the x_axis + scenario_axis: bool = False, # if True, columns become scenarios + scenario_cols: Optional[Sequence[str]] = None, +): + """ + Build a heatmap either by a single column (e.g., Facility_Level) or across multiple + scenario columns placed on the x-axis. 
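+
+    A sketch of a typical call (it mirrors the Figure 1 invocation further
+    below; the returned frame holds the plotted means):
+    >>> fig, ax, heatmap_df = generate_heatmap(
+    ...     df=tlo_availability_df, value_col="Actual",
+    ...     row="item_category", col="Facility_Level",
+    ... )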
+    """
+    if include_levels is not None:
+        df = df[df.Facility_Level.isin(include_levels)]
+    if scenario_axis:
+        aggregated = (
+            df.groupby([row], dropna=True)[scenario_cols]
+            .mean()
+            .reset_index()
+        )
+        # Add perfect scenario
+        aggregated['Perfect'] = 1  # Add a column representing the perfect scenario
+        heatmap_df = aggregated.set_index(row)
+    else:
+        # Standard mode: columns = `col`, values = mean(value_col)
+        aggregated = (
+            df.groupby([row, col], dropna=True)[value_col]
+            .mean()
+            .reset_index()
+        )
+        heatmap_df = aggregated.pivot(index=row, columns=col, values=value_col)
+
+    # Optional ordering
+    if row_order is not None:
+        heatmap_df = heatmap_df.reindex(row_order)
+    if col_order is not None:
+        heatmap_df = heatmap_df.reindex(columns=col_order)
+
+    # 2) Totals (means across the raw data, not the pivot means)
+    if scenario_axis:
+        # Mean per scenario column, averaged across programs
+        row_means = heatmap_df.mean(axis=0)  # average per scenario across programs
+        avg_row = row_means.copy()
+        heatmap_df.loc["Average"] = avg_row
+    else:
+        # Compute from raw df to avoid double-averaging
+        col_means = df.groupby(row, dropna=False)[value_col].mean()
+        row_means = df.groupby(col, dropna=False)[value_col].mean()
+        overall_mean = df[value_col].mean()
+
+        heatmap_df["Average"] = col_means.reindex(heatmap_df.index)
+        avg_row = row_means.reindex(heatmap_df.columns).copy()
+        avg_row.loc["Average"] = overall_mean
+        heatmap_df.loc["Average"] = avg_row
+
+    # 3) Annotation formatting
+    if as_pct:
+        # If values are 0–1 proportions, annotate as percentages
+        display_df = (heatmap_df * 100).round(round_decimals)
+        if fmt is None:
+            fmt = ".0f" if round_decimals == 0 else f".{round_decimals}f"
+        annot_data = display_df.astype(float)
+    else:
+        display_df = heatmap_df.round(round_decimals)
+        if fmt is None:
+            fmt = f".{round_decimals}f"
+        annot_data = display_df.astype(float)
+
+    # 4) Plot
+    sns.set(font_scale=font_scale)
+    fig, ax = plt.subplots(figsize=figsize)
+    hm = sns.heatmap(
+        annot_data,
+        annot=annot,
+        cmap=cmap,
+        cbar_kws={"label": cbar_label},
+        ax=ax,
+        fmt=fmt,  # use the format string chosen above rather than a hard-coded ".2f"
+    )
+
+    # If percentage labels requested, overwrite text with % suffix
+    if annot and as_pct:
+        for t in ax.texts:
+            t.set_text(f"{t.get_text()}%")
+
+    # 5) Labels & ticks
+    xlab = (xlabel or ("Scenario" if scenario_axis else col.replace("_", " ").title()))
+    ylab = (ylabel or row.replace("_", " ").title())
+    ax.set_xlabel(xlab)
+    ax.set_ylabel(ylab)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
+    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
+
+    # 6) Save (optional)
+    if figurespath is not None:
+        figurespath = Path(figurespath)
+        figurespath.mkdir(parents=True, exist_ok=True)
+        outpath = figurespath / filename
+        plt.savefig(outpath, dpi=300, bbox_inches="tight")
+
+    plt.show()
+    plt.close(fig)
+    return fig, ax, heatmap_df
+
+
+# Import and clean data files
+#**********************************
+# Import TLO model availability data
+tlo_availability_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_availability_small.csv")
+scenario_names_dict={
+    'available_prop': 'Actual',
+    'available_prop_scenario1': 'Non-therapeutic consumables',
+    'available_prop_scenario2': 'Vital medicines',
+    'available_prop_scenario3': 'Pharmacist- managed',
+    'available_prop_scenario4': 'Level 1b',
+    'available_prop_scenario5': 'CHAM',
+    'available_prop_scenario6': '75th percentile facility',
+
'available_prop_scenario7': '90th percentile facility', + 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)', + 'available_prop_scenario10': 'HIV supply chain', + 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain (Avg by Level)', + 'available_prop_scenario13': 'HIV moved to Govt supply chain (Avg by Facility_ID)', + 'available_prop_scenario14': 'HIV moved to Govt supply chain (Avg by Facility_ID times 1.25)', + 'available_prop_scenario15': 'HIV moved to Govt supply chain (Avg by Facility_ID times 0.75)' + } + +tlo_availability_df = prepare_availability_dataset_for_plots( + _df=tlo_availability_df, + scenario_list=[1, 2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 15], + scenario_names_dict=scenario_names_dict, + consumable_resourcefilepath=consumable_resourcefilepath, + resourcefilepath=resourcefilepath +) + +# Generate figures for manuscript +#********************************** +# Figure 1: Average probability of consumable availability in public and CHAM health facilities in Malawi +_ = generate_heatmap( + df=tlo_availability_df, + value_col="Actual", + row="item_category", + col="Facility_Level", + figurespath = outputfilepath / 'manuscript', + filename="heatmap_program_and_level_actual.png", + figsize=(10, 8), + cmap="RdYlGn", + round_decimals=4, + cbar_label="Proportion of days on which consumable is available", + xlabel="Facility Level", + ylabel="Program", +) + +# Figure 3: Comparison of consumable availability across modelled scenarios +scenario_cols = ['Actual', 'Non-therapeutic consumables', 'Vital medicines', 'Pharmacist- managed','75th percentile facility', '90th percentile facility', 'Best facility'] +for level in ['1a', '1b']: + _ = generate_heatmap( + df=tlo_availability_df, + include_levels = [level], + value_col="Actual", + row="item_category", + col="Facility_Level", + figurespath=outputfilepath / 'manuscript', + filename=f"scenarios_heatmap_{level}.png", + figsize=(10, 8), + cmap="RdYlGn", + round_decimals=4, + cbar_label="Proportion of days on which consumable is available", + xlabel="Facility Level", + ylabel="Program", + scenario_axis = True, # if True, columns become scenarios + scenario_cols = scenario_cols, + ) + + +# Figure A.1: Trend in average consumable availability by facility level + +# Figure A.2: Comparison of consumable availability as per Open Logistics Management Information System (OpenLMIS), 2018 and Harmonised +# Health Facility Assessment, 2018-19 + +# Table B.1: Average probability of availability for each consumable under all scenarios (Level 1a) +def assign_consumable_names_to_item_codes(df): + # Create dictionary mapping item_codes to consumables names + consumables_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']] + consumables_df = consumables_df[consumables_df['Item_Code'].notna()] + consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items'])) + + # Add consumable_name to df + df = df.copy() + df['item_name'] = df['item_code'].map(consumables_dict) + + return df +tlo_availability_df = assign_consumable_names_to_item_codes(tlo_availability_df) + +def generate_detail_availability_table(df, + groupby_var, + longtable = False, + figurespath=outputfilepath / 'appendix', + decimals=2 +): + table_df = df.copy() + table_df[groupby_var] = table_df[groupby_var].replace('_', ' ', regex=True) + table_df[groupby_var] = table_df[groupby_var].replace('%', r'\%', 
regex=True) + table_df[groupby_var] = table_df[groupby_var].replace('&', r'\&', regex=True) + + table_df = table_df.groupby(['item_name'])[scenario_cols].mean() + # Multiply by 100 and format with escaped percent sign for LaTeX + table_df[scenario_cols] = table_df[scenario_cols].applymap( + lambda x: f"{x * 100:.{decimals}f}\\%" + ) + + # Rename columns for clarity + table_df = table_df.reset_index() + table_df.columns = ['Consumable'] + scenario_cols + + # Rename columns + table_df.rename(columns={'item_name': 'Consumable'}, inplace=True) + + # Convert to LaTeX + latex_table = table_df.to_latex( + longtable=longtable, + column_format='|R{4cm}|' + '|'.join(['R{1cm}'] * len(table_df.columns[1:])) + '|', + caption=f"Summarized availability by consumable", + label=f"tab:availability_by_{groupby_var}", + position="h", + index=False, + escape=False, # we already escaped % and & + header=True + ) + + # Add \hline after each row + latex_table = latex_table.replace("\\\\", "\\\\ \\hline") + + # Save + figurespath.mkdir(parents=True, exist_ok=True) + latex_file_path = figurespath / f'availability_by_{groupby_var}.tex' + with open(latex_file_path, 'w') as latex_file: + latex_file.write(latex_table) + + # Print for reference + print(latex_table) + +# Table F1: Cost by cost subcategory +generate_detail_availability_table(df = tlo_availability_df, + groupby_var = 'item_name', + longtable = True, + figurespath=outputfilepath / 'appendix') + +# Table B.2: Average probability of availability for each consumable under all scenarios (Level 1b) + From 46a0cb9c0ccae8cb992c1aa258b0f126a4ddc932 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 20 Aug 2025 15:20:31 +0100 Subject: [PATCH 02/16] Update detailed tables on consumable availability by scenario --- .../descriptive_analysis.py | 204 ++++++++++++------ 1 file changed, 139 insertions(+), 65 deletions(-) diff --git a/src/scripts/consumables_analyses/descriptive_analysis.py b/src/scripts/consumables_analyses/descriptive_analysis.py index bf3528ea87..61780bf128 100644 --- a/src/scripts/consumables_analyses/descriptive_analysis.py +++ b/src/scripts/consumables_analyses/descriptive_analysis.py @@ -35,6 +35,19 @@ os.makedirs(outputfilepath) # Utility functions +# Assign item names to item codes +def assign_consumable_names_to_item_codes(df): + # Create dictionary mapping item_codes to consumables names + consumables_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']] + consumables_df = consumables_df[consumables_df['Item_Code'].notna()] + consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items'])) + + # Add consumable_name to df + df = df.copy() + df['item_name'] = df['item_code'].map(consumables_dict) + + return df + # Prepare availability data def prepare_availability_dataset_for_plots( _df: pd.DataFrame, @@ -223,6 +236,111 @@ def generate_heatmap( plt.close(fig) return fig, ax, heatmap_df +# Generate LaTex-compatible detailed table of availability +def generate_detail_availability_table_by_scenario( + df: pd.DataFrame, + groupby_var: str, + scenario_cols: Sequence[str], + include_levels: Union[str, Sequence[str]], + longtable: bool = False, + outputpath: Path = None, + decimals: int = 2, + caption: Optional[str] = None, + label_prefix: str = "tab:availability_by_", + col_width_groupbyvar: str = "4cm", + col_width_scenario: str = "1.8cm", +) -> str: + """ + Create a LaTeX table of average availability (as percentages) for each consumable across scenarios, + 
+    filtered to selected facility levels.
+
+    Returns the LaTeX string and writes it to
+    outputpath / f"availability_by_{groupby_var}_{levels}.tex", where `levels`
+    is the selected facility level(s) joined by underscores.
+    """
+
+    # --- Setup ---
+    if outputpath is None:
+        outputpath = outputfilepath / "appendix"  # fall back to the script-wide default
+    outputpath.mkdir(parents=True, exist_ok=True)
+
+    # Accept str OR list for include_levels
+    if isinstance(include_levels, str):
+        include_levels = [include_levels]
+    levels_tag = "_".join(include_levels)  # e.g. "1a" or "1a_1b"; used in the label and filename
+
+    table_df = df.copy()
+
+    # Filter by facility level if the column is present
+    if "Facility_Level" in table_df.columns:
+        table_df = table_df[table_df["Facility_Level"].isin(include_levels)]
+
+    # Aggregate means per item
+    grouped = (
+        table_df
+        .groupby([groupby_var], dropna=False)[list(scenario_cols)]
+        .mean()
+        .reset_index()
+    )
+
+    # Rename first column to "Consumable"
+    grouped.rename(columns={groupby_var: "Consumable"}, inplace=True)

+    # Escape LaTeX in Consumable names
+    def _latex_escape(s):
+        if pd.isna(s):
+            return ""
+        s = str(s)
+        # Order matters for backslashes; escape backslash first
+        s = s.replace("\\", r"\\")
+        s = s.replace("&", r"\&").replace("%", r"\%").replace("_", r"\_")
+        s = s.replace("#", r"\#").replace("{", r"\{").replace("}", r"\}")
+        s = s.replace("$", r"\$").replace("^", r"\^{}").replace("~", r"\~{}")
+        return s
+
+    grouped["Consumable"] = grouped["Consumable"].map(_latex_escape)
+
+    # Convert proportions -> percentage strings with escaped %
+    def pct_format(x: float) -> str:
+        if pd.isna(x):
+            return ""
+        return f"{x * 100:.{decimals}f}\\%"
+
+    for c in scenario_cols:
+        grouped[c] = grouped[c].map(pct_format)
+
+    # Build column format dynamically
+    # First col wider for names, then one col per scenario
+    column_format = (
+        f"|R{{{col_width_groupbyvar}}}|"
+        + "|".join([f"R{{{col_width_scenario}}}"] * len(scenario_cols))
+        + "|"
+    )
+
+    # Caption/label
+    if caption is None:
+        caption = "Summarized availability by consumable"
+    label = f"{label_prefix}{groupby_var}_{levels_tag}"
+
+    # Export to LaTeX (escape=False since we already escaped)
+    latex_table = grouped.to_latex(
+        longtable=longtable,
+        column_format=column_format,
+        caption=caption,
+        label=label,
+        position="h",
+        index=False,
+        escape=False,
+        header=True,
+    )
+
+    # Add \hline after each row
+    latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n")
+
+    # Save
+    outpath = outputpath / f"availability_by_{groupby_var}_{levels_tag}.tex"
+    with open(outpath, "w", encoding="utf-8") as f:
+        f.write(latex_table)
+
+    return latex_table
+

 # Import and clean data files
 #**********************************
@@ -301,72 +419,28 @@ def generate_heatmap(
 # Health Facility Assessment, 2018-19

 # Table B.1: Average probability of availability for each consumable under all scenarios (Level 1a)
-def assign_consumable_names_to_item_codes(df):
-    # Create dictionary mapping item_codes to consumables names
-    consumables_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']]
-    consumables_df = consumables_df[consumables_df['Item_Code'].notna()]
-    consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items']))
-
-    # Add consumable_name to df
-    df = df.copy()
-    df['item_name'] = df['item_code'].map(consumables_dict)
-
-    return df
 tlo_availability_df = assign_consumable_names_to_item_codes(tlo_availability_df)

-def generate_detail_availability_table(df,
-                               groupby_var,
-                               longtable = False,
-                               figurespath=outputfilepath / 'appendix',
-                               decimals=2
-):
-    table_df = df.copy()
-    table_df[groupby_var] = table_df[groupby_var].replace('_', ' ', regex=True)
-    table_df[groupby_var] = table_df[groupby_var].replace('%', r'\%', regex=True)
-    table_df[groupby_var] = table_df[groupby_var].replace('&', r'\&', regex=True)
-
-    table_df = table_df.groupby(['item_name'])[scenario_cols].mean()
-    # Multiply by 100 and format with escaped percent sign for LaTeX
-    table_df[scenario_cols] = table_df[scenario_cols].applymap(
-        lambda x: f"{x * 100:.{decimals}f}\\%"
-    )
-
-    # Rename columns for clarity
-    table_df = table_df.reset_index()
-    table_df.columns = ['Consumable'] + scenario_cols
-
-    # Rename columns
-    table_df.rename(columns={'item_name': 'Consumable'}, inplace=True)
-
-    # Convert to LaTeX
-    latex_table = table_df.to_latex(
-        longtable=longtable,
-        column_format='|R{4cm}|' + '|'.join(['R{1cm}'] * len(table_df.columns[1:])) + '|',
-        caption=f"Summarized availability by consumable",
-        label=f"tab:availability_by_{groupby_var}",
-        position="h",
-        index=False,
-        escape=False,  # we already escaped % and &
-        header=True
-    )
-
-    # Add \hline after each row
-    latex_table = latex_table.replace("\\\\", "\\\\ \\hline")
-
-    # Save
-    figurespath.mkdir(parents=True, exist_ok=True)
-    latex_file_path = figurespath / f'availability_by_{groupby_var}.tex'
-    with open(latex_file_path, 'w') as latex_file:
-        latex_file.write(latex_table)
-
-    # Print for reference
-    print(latex_table)
-
-# Table F1: Cost by cost subcategory
-generate_detail_availability_table(df = tlo_availability_df,
-                               groupby_var = 'item_name',
-                               longtable = True,
-                               figurespath=outputfilepath / 'appendix')
+availability_by_item_1a = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1a",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1a)",
+)

 # Table B.2: Average probability of availability for each consumable under all scenarios (Level 1b)
-
+availability_by_item_1b = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1b",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1b)",
+)

From 53b030362c720ea6333d60cacc33abae45003cb2 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Wed, 10 Sep 2025 11:57:44 +0700
Subject: [PATCH 03/16] Generate the first version of optimisation of
 consumable (re)distribution

---
 ...ate_consumable_redistribution_scenarios.py | 705 ++++++++++++++++++
 1 file changed, 705 insertions(+)
 create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py
new file mode 100644
index 0000000000..cd3680556d
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py
@@ -0,0 +1,705 @@
+import datetime
+from pathlib import Path
+import pickle
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+from typing import Literal, Optional, Dict, Tuple
+import requests
+
+from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous
+
+# define a timestamp for script outputs
+timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
+
+# print the start time of the script
+print('Script Start', datetime.datetime.now().strftime('%H:%M'))
+
+# define folder pathways
+outputfilepath = Path("./outputs/consumables_impact_analysis")
+resourcefilepath = Path("./resources")
+path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
+# Set local shared drive source
+path_to_share = Path(  # <-- point to the shared folder
+    '/Users/sm2511/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/'
+)
+
+# 1. Import and clean data files
+#**********************************
+# Import cleaned OpenLMIS data from 2018
+lmis = (pd.read_csv(outputfilepath / "ResourceFile_Consumables_availability_and_usage.csv")
+        [['district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'available_prop',
+          'closing_bal', 'amc', 'dispensed', 'received']])
+# Import data on facility location
+location = (pd.read_excel(path_to_share / "07 - Data/Facility_GPS_Coordinates/gis_data_for_openlmis/LMISFacilityLocations_raw.xlsx")
+            [['LMIS Facility List', 'LATITUDE', 'LONGITUDE']])
+# Find duplicates in facility names in the location dataset
+duplicates = location[location['LMIS Facility List'].duplicated(keep=False)]
+location = location.drop(duplicates[duplicates['LATITUDE'].isna()].index).reset_index(drop=True)  # Drop those duplicates whose location is missing
+# Import ownership data
+ownership = (pd.read_csv(path_to_share / "07 - Data/Consumables data/OpenLMIS/lmis_facility_ownership.csv"))[['fac_name', 'fac_owner']]
+ownership = ownership.drop_duplicates(subset=['fac_name'])
+
+# Merge OpenLMIS data with the location and ownership data
+lmis = lmis.merge(location, left_on='fac_name', right_on = 'LMIS Facility List', how = 'left', validate='m:1')
+lmis = lmis.merge(ownership, on='fac_name', how = 'left', validate='m:1')
+lmis.rename(columns = {'LATITUDE':'lat', 'LONGITUDE':'long', 'fac_type_tlo': 'Facility_Level'}, inplace = True)
+
+# Clean the month and facility-level columns to match the format of the consumable availability ResourceFile in the TLO model
+month_map = {
+    "January": 1, "February": 2, "March": 3, "April": 4,
+    "May": 5, "June": 6, "July": 7, "August": 8,
+    "September": 9, "October": 10, "November": 11, "December": 12
+}
+lmis["month"] = lmis["month"].map(month_map)
+lmis["Facility_Level"] = lmis["Facility_Level"].str.replace("Facility_level_", "", regex=False)
+
+# Keep only those facilities whose location is available
+lmis = lmis[lmis.lat.notna()]
+# TODO assume something else about the location of facilities with missing coordinates - e.g. the centroid of their district (see the sketch below)
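+# A minimal sketch of that centroid imputation (hypothetical; it would have to
+# run *before* the dropna filter above, so it is left commented out here):
+# district_centroids = lmis.groupby('district')[['lat', 'long']].transform('mean')
+# lmis[['lat', 'long']] = lmis[['lat', 'long']].fillna(district_centroids)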
+# only 16 facilities have missing location information
+
+# For now, let's keep only 8 facilities in 1 district
+#lmis = lmis[lmis.district == 'Balaka']
+# Keep only facilities with Facility_Level == '1a'
+#lmis = lmis[lmis["Facility_Level"] == '1a']
+
+# -----------------------------------
+# 1) Estimate travel time matrix
+# -----------------------------------
+def _chunk_indices(n: int, chunk: int):
+    """Yield (start, end) index pairs for chunking 0..n-1."""
+    for s in range(0, n, chunk):
+        e = min(n, s + chunk)
+        yield s, e
+
+def build_travel_time_matrix(
+    fac_df: pd.DataFrame,
+    *,
+    id_col: str = "fac_name",
+    lat_col: str = "lat",
+    lon_col: str = "long",
+    mode: Literal["car", "bicycle"] = "car",
+    backend: Literal["ors", "osrm"] = "ors",
+    # ORS options
+    ors_api_key: Optional[str] = None,
+    ors_base_url: str = "https://api.openrouteservice.org/v2/matrix",
+    # OSRM options (self-hosted or public; note: public often has only 'car')
+    osrm_base_url: str = "https://router.project-osrm.org",
+    osrm_profile_map: Optional[dict] = None,
+    # matrix request chunking (keeps requests within API limits)
+    max_chunk: int = 40,
+    timeout: int = 60,
+) -> pd.DataFrame:
+    """
+    Build an NxN *road* travel-time matrix (minutes) for facilities, by car or bicycle.
+
+    backends:
+      - 'ors'  -> uses OpenRouteService Matrix API (profiles: driving-car, cycling-regular).
+                  Requires ors_api_key. Has rate/size limits; we auto-chunk.
+      - 'osrm' -> uses OSRM 'table' service (profiles typically 'car' or 'bike').
+                  For bicycle, you'll likely need a self-hosted OSRM with the bicycle profile.
+
+    Parameters
+    ----------
+    mode : 'car' | 'bicycle'
+        Travel mode on roads.
+    max_chunk : int
+        Max #origins (and #destinations) per sub-matrix request to keep within API limits.
+
+    Returns
+    -------
+    pd.DataFrame
+        Square DataFrame (minutes), index/columns = facility names.
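+
+    A sketch of a typical call (facility names are illustrative; the public
+    OSRM demo server generally supports the car profile only):
+    >>> T = build_travel_time_matrix(fac_coords, mode="car", backend="osrm")
+    >>> T.loc["Facility A", "Facility B"]  # minutes by road, inf if unroutable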
+ """ + facs = fac_df[[id_col, lat_col, lon_col]].dropna().drop_duplicates().reset_index(drop=True) + ids = facs[id_col].tolist() + lats = facs[lat_col].to_numpy() + lons = facs[lon_col].to_numpy() + n = len(ids) + + T = pd.DataFrame(np.full((n, n), np.nan, dtype=float), index=ids, columns=ids) + np.fill_diagonal(T.values, 0.0) + + if n == 0: + return T + + if backend == "ors": + if ors_api_key is None: + raise ValueError("OpenRouteService requires ors_api_key.") + profile = "driving-car" if mode == "car" else "cycling-regular" + + # ORS expects [lon, lat] + coords = [[float(lons[i]), float(lats[i])] for i in range(n)] + + headers = {"Authorization": ors_api_key, "Content-Type": "application/json"} + + # Chunk both sources and destinations to stay under limits + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = list(range(si, sj)) + destinations = list(range(di, dj)) + + body = { + "locations": coords, + "sources": sources, + "destinations": destinations, + "metrics": ["duration"], + } + url = f"{ors_base_url}/{profile}" + r = requests.post(url, json=body, headers=headers, timeout=timeout) + r.raise_for_status() + data = r.json() + + # durations in seconds; fill submatrix + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"ORS returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + elif backend == "osrm": + # Map desired mode to OSRM profile names + if osrm_profile_map is None: + osrm_profile_map = {"car": "car", "bicycle": "bike"} + profile = osrm_profile_map.get(mode) + if profile is None: + raise ValueError(f"No OSRM profile mapped for mode='{mode}'.") + + # NOTE: The public OSRM demo often supports *car only*. + # For bicycle, run your own OSRM with the bicycle profile. + # We use the OSRM 'table' service; POST with 'sources' and 'destinations' indices. + + coords = ";".join([f"{lons[i]},{lats[i]}" for i in range(n)]) # lon,lat + + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = ";".join(map(str, range(si, sj))) + destinations = ";".join(map(str, range(di, dj))) + + url = ( + f"{osrm_base_url}/table/v1/{profile}/{coords}" + f"?sources={sources}&destinations={destinations}&annotations=duration" + ) + r = requests.get(url, timeout=timeout) + r.raise_for_status() + data = r.json() + + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"OSRM returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + else: + raise ValueError("backend must be 'ors' or 'osrm'.") + + # Safety: replace any residual NaNs (unroutable pairs) with inf or a large number + T = T.fillna(np.inf) + return T + +# Because ORS and ORSM can only handle a limited number of elements at a time, it's better to run this by district +# each of which already has under 50 facilities +def build_time_matrices_by_district( + df: pd.DataFrame, + *, + district_col: str = "district", + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: str = "car", + backend: str = "osrm", + osrm_base_url: str = "https://router.project-osrm.org", + ors_api_key: str | None = None, + max_chunk: int = 50, # safe for both OSRM/ORS +) -> dict[str, pd.DataFrame]: + """ + Returns a dict: {district -> square minutes matrix DataFrame}, computed within each district only. 
+ """ + matrices = {} + # unique facilities per district (drop duplicates in case of repeated rows) + fac_cols = [district_col, id_col, lat_col, lon_col] + fac_coords = df[fac_cols].dropna().drop_duplicates() + + for d, facs_d in fac_coords.groupby(district_col): + # Skip tiny groups + if len(facs_d) < 2: + continue + + T = build_travel_time_matrix( + fac_df=facs_d[[id_col, lat_col, lon_col]], + id_col=id_col, lat_col=lat_col, lon_col=lon_col, + mode=mode, backend=backend, + osrm_base_url=osrm_base_url, + ors_api_key=ors_api_key, + max_chunk=max_chunk, + ) + matrices[d] = T + + return matrices + +# ----------------------------------------------- +# 2) Data prep for redistribution linear program +# ----------------------------------------------- +def compute_opening_balance(df: pd.DataFrame) -> pd.Series: + """ + Opening balance from same-month records: + OB = closing_bal - received + dispensed. + This is equivalent to OB_(m) = CB_(m-1) + """ + return df["closing_bal"] - df["received"] + df["dispensed"] + + +def presumed_availability(ob, amc, eps=1e-9) -> float: + """ + Presumed likelihood of cons availability = p = min(1, OB/AMC) at month start (no additional receipts considered + at this point in time). + """ + return float(min(1.0, max(0.0, (ob / max(eps, amc))))) + + +def build_edges_within_radius( + time_matrix: pd.DataFrame, + max_minutes: float +) -> Dict[str, set]: + """ + For each facility g, return set of receivers f such that T[g,f] <= max_minutes, f != g. + """ + edges = {} + for g in time_matrix.index: + feasible = set(time_matrix.columns[(time_matrix.loc[g] <= max_minutes) & (time_matrix.columns != g)]) + edges[g] = feasible + return edges + +def build_edges_within_radius_flat(T_by_dist: dict, max_minutes: float) -> dict[str, set[str]]: + """ + Takes the district-wise dictionary of time travel matrices and converts it into a flat dictionary of facilities + and their edge neighbours depending on the maximum allowable travel distance. + T_by_dist: {district -> square DataFrame of minutes (index/cols = facility IDs)} + Returns: {facility_id -> set(of facility_ids)} for all districts combined. + """ + edges: dict[str, set[str]] = {} + for _, T in T_by_dist.items(): + for g in T.index: + row = T.loc[g].to_numpy() + feasible_mask = (row <= max_minutes) & np.isfinite(row) + # Exclude self + feasible = [f for f in T.columns[feasible_mask] if f != g] + if g not in edges: + edges[g] = set() + edges[g].update(feasible) + return edges + +# a = build_edges_within_radius(T_car, max_minutes = 18) + +# ----------------------------------------------- +# 3) LP/MILP Redistribution +# ----------------------------------------------- +def redistribute_radius_lp( + df: pd.DataFrame, + time_matrix: pd.DataFrame, + radius_minutes: float, + tau_keep: float = 2.0, # Keeper minimum (2 --> Donors want to keep at least 2 X AMC) + tau_tar: float = 1.0, # Receiver target (1 --> Receivers want OB = AMC) + alpha: float = 0.7, # Donation cap (between 0 and 1) + # alpha can be set to 1 if donors only want to keep a min of tau_keep X AMC + K_in: int = 2, # max donors per receiver (per item) + K_out: int = 2, # max receivers per donor (per item) + # TODO the above limits should be across items not per item + Qmin_days: float = 7.0, # minimum lot size in "days of demand" at receiver + # i.e. 
at least Qmin_days worth of stock should be supplied for the transfer to be justifiable + id_cols=("district","month","item_code"), + facility_col="fac_name", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), +) -> pd.DataFrame: + """ + Scenario 1 or 2: local swaps within a time radius. + Presumed p' = min(1, OB'/AMC). + Objective: maximize total filled deficits (sum of transfers), which is a proxy for maximizing total availability. + + Returns a copy of df with a new column 'available_prop_redis' for redistributed probability per row. + """ + closing_bal, received, dispensed = close_cols + df = df.copy() + df["OB"] = compute_opening_balance(df) + + # container for updated OB by (district, month, item, facility) + df["OB_prime"] = df["OB"] + + # iterate per (district, month, item) + group_cols = list(id_cols) + for (d, m, i), g in df.groupby(group_cols): + # facilities in this slice + g = g.copy() + facs = g[facility_col].tolist() + + # build per-fac arrays + AMC = g.set_index(facility_col)[amc_col].astype(float) + OB0 = g.set_index(facility_col)["OB"].astype(float) + + # donor surplus cap and receiver deficit + surplus_cap = alpha * np.maximum(0.0, OB0.values - tau_keep * AMC.values) + deficit = np.maximum(0.0, tau_tar * AMC.values - OB0.values) + donors_mask = surplus_cap > 1e-9 # boolean vector marking facilities that actually have positive surplus + recv_mask = deficit > 1e-9 # boolean vector marking facilities that actually have a real need (deficit > 0). + + if not donors_mask.any() or not recv_mask.any(): + # nothing to do for this slice + continue + + donors = list(AMC.index[donors_mask]) + receivers = list(AMC.index[recv_mask]) + + # Feasible edges within radius + #feasible = build_edges_within_radius(time_matrix.loc[donors + receivers, donors + receivers], radius_minutes) + T_d = time_matrix[d] # time_matrix is dict {district -> DataFrame} + + # 2) limit to only facilities present in this slice + slice_facs = g[facility_col].unique().tolist() + T_d = T_d.loc[slice_facs, slice_facs] + + # 3) build feasible edges for this radius + feasible = {} + for g_fac in T_d.index: + feas_mask = (T_d.loc[g_fac].to_numpy() <= radius_minutes) & np.isfinite(T_d.loc[g_fac].to_numpy()) + feasible[g_fac] = set(T_d.columns[feas_mask]) - {g_fac} + + # Minimum lot (units) for each receiver depends on its AMC + AMC_map = AMC.to_dict() + Qmin_units = {f: Qmin_days * max(1e-9, AMC_map[f]) for f in receivers} + + # Big-M per edge: cannot exceed receiver deficit + deficit_map = dict(zip(AMC.index, deficit)) # creates a dict with key = facility and value = deficit + # this is the upper bound for how much a facility can receive + smax_map = dict(zip(AMC.index, surplus_cap)) # creates a dict with key = facility and value = donatable surplus + # this is the upper bound for how much a facility can donate + + # Build MILP + prob = LpProblem(f"Redistribution_{d}_{m}_{i}", LpMaximize) + + # Variables + t = {} # transfer amounts per (g,f): continuous >=0 + y = {} # edge-activation binaries + + for g_fac in donors: + for f_fac in feasible.get(g_fac, set()): + if f_fac in receivers: + t[(g_fac, f_fac)] = LpVariable(f"t_{g_fac}->{f_fac}", lowBound=0, cat=LpContinuous) + y[(g_fac, f_fac)] = LpVariable(f"y_{g_fac}->{f_fac}", lowBound=0, upBound=1, cat=LpBinary) + + # Objective: maximize total shipped (sum of t) + prob += lpSum(t.values()) + + # Donor outflow caps + for g_fac in donors: + prob += lpSum(t[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in t) <= smax_map[g_fac] + + # Receiver 
inflow caps + for f_fac in receivers: + prob += lpSum(t[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in t) <= deficit_map[f_fac] + + # Edge linking + minimum lot size + K_in/K_out + for (g_fac, f_fac), var in t.items(): + M = deficit_map[f_fac] + prob += var <= M * y[(g_fac, f_fac)] + # minimum lot only makes sense if the edge is used at all; also don't force Qmin to exceed deficit + prob += var >= min(Qmin_units[f_fac], deficit_map[f_fac]) * y[(g_fac, f_fac)] + + # Max donors per receiver + for f_fac in receivers: + prob += lpSum(y[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in y) <= K_in + + # Max receivers per donor + for g_fac in donors: + prob += lpSum(y[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in y) <= K_out + + # Solve (default solver) + prob.solve() + + # Apply transfers to OB' + if LpStatus[prob.status] not in ("Optimal","Optimal Infeasible","Not Solved"): + # If not solved (rare), skip changes + continue + + # net change per facility + delta = {fac: 0.0 for fac in facs} + for (g_fac, f_fac), var in t.items(): + moved = value(var) or 0.0 + delta[g_fac] -= moved + delta[f_fac] += moved + + # write back OB' + sel = (df["district"].eq(d) & df["month"].eq(m) & df["item_code"].eq(i)) + df.loc[sel, "OB_prime"] = df.loc[sel].apply( + lambda r: r["OB"] + delta.get(r[facility_col], 0.0), axis=1 + ) + + # Mechanistic availability after redistribution + df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(1e-9, df[amc_col]))) + return df + +def redistribute_pooling_lp( + df: pd.DataFrame, + tau_min: float = 0.25, # optional lower floor (this is the same as 7 days Q_min) + tau_max: float = 3.0, # storage/USAID style max + id_cols=("district","month","item_code"), + facility_col="fac_name", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), +) -> pd.DataFrame: + """ + Scenario 3: district-level pooling/push. Linear program maximizing total presumed availability. + + We linearize p = min(1, x/AMC) by introducing p_f in [0,1] with constraint AMC_f * p_f <= x_f. + + Returns df with 'available_prop_redis' updated for the pooling scenario. 
+ """ + closing_bal, received, dispensed = close_cols + df = df.copy() + df["OB"] = compute_opening_balance(df) + + df["OB_prime"] = df["OB"] + + group_cols = list(id_cols) + for (d, m, i), g in df.groupby(group_cols): + g = g.copy() + facs = g[facility_col].tolist() + AMC = g.set_index(facility_col)[amc_col].astype(float) + OB0 = g.set_index(facility_col)["OB"].astype(float) + + total_stock = float(OB0.sum()) + if total_stock <= 1e-9: + # nothing to allocate + continue + + # LP + prob = LpProblem(f"Pooling_{d}_{m}_{i}", LpMaximize) + + # Decision vars + x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs} # allocated opening stock + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs} # availability + + # Objective: maximize sum p_f (optionally weight by item importance) + prob += lpSum(p.values()) + + # Conservation + prob += lpSum(x.values()) == total_stock + + # Guardrails (optional) + for f in facs: + prob += x[f] >= tau_min * AMC[f] + prob += x[f] <= tau_max * AMC[f] + + # Linearization: AMC_f * p_f <= x_f + for f in facs: + prob += AMC[f] * p[f] <= x[f] + + prob.solve() + + if LpStatus[prob.status] not in ("Optimal","Optimal Infeasible","Not Solved"): + continue + + # Write back OB' and p' + x_sol = {f: value(var) or 0.0 for f, var in x.items()} + # OB' equals allocated x + sel = (df["district"].eq(d) & df["month"].eq(m) & df["item_code"].eq(i)) + df.loc[sel, "OB_prime"] = df.loc[sel].apply(lambda r: x_sol.get(r[facility_col], r["OB"]), axis=1) + + # Mechanistic availability p' = min(1, OB'/AMC) + df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(1e-9, df[amc_col]))) + return df + +# ------------------------ +# 4) HIGH-LEVEL DRIVER +# ------------------------ +def run_redistribution_scenarios( + df: pd.DataFrame, + scenario: Literal["radius_30","radius_60","pooling"], + time_matrix: Optional[pd.DataFrame] = None, + # tunables + tau_keep: float = 2.0, + tau_tar: float = 1.0, + alpha: float = 0.7, + K_in: int = 2, + K_out: int = 2, + Qmin_days: float = 7.0, + tau_min: float = 0.75, + tau_max: float = 3.0, + # schema + id_cols=("district","month","item_code"), + facility_col="fac_name", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), + # aggregation for TLO input + aggregate_for_tlo: bool = True, + level_col: str = "Facility_Level" +) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + """ + Runs exactly one scenario and returns: + - detailed_df: row-level dataframe with 'available_prop_redis' + - tlo_df (optional): aggregated to (item_code, district, Facility_Level, month) with mean availability + + For radius scenarios, 'time_matrix' is required. For pooling, it is ignored. 
+ """ + if scenario in ("radius_30","radius_60"): + if time_matrix is None: + raise ValueError("time_matrix is required for radius-based scenarios.") + radius = 30 if scenario=="radius_30" else 60 + detailed = redistribute_radius_lp( + df=df, + time_matrix=time_matrix, + radius_minutes=radius, + tau_keep=tau_keep, + tau_tar=tau_tar, + alpha=alpha, + K_in=K_in, + K_out=K_out, + Qmin_days=Qmin_days, + id_cols=id_cols, + facility_col=facility_col, + amc_col=amc_col, + close_cols=close_cols, + ) + elif scenario == "pooling": + detailed = redistribute_pooling_lp( + df=df, + tau_min=tau_min, + tau_max=tau_max, + id_cols=id_cols, + facility_col=facility_col, + amc_col=amc_col, + close_cols=close_cols, + ) + else: + raise ValueError("scenario must be one of 'radius_30', 'radius_60', 'pooling'.") + + if not aggregate_for_tlo: + return detailed, None + + # Aggregate to TLO input grain: mean across facilities (match your current pipeline) + tlo = ( + detailed + .groupby(["item_code", "district", level_col, "month"], as_index=False) + .agg(new_available_prop=("available_prop_redis", "mean")) + .sort_values(["item_code","district",level_col,"month"]) + ) + return tlo + +# IMPLEMENT +# 1) Build a time matrix +fac_coords = lmis[['fac_name', 'district', 'lat','long']] +#T_car = build_time_matrices_by_district( +# fac_coords, +# mode="car", +# backend="osrm", +# osrm_base_url="https://router.project-osrm.org", +# max_chunk=50) + +# Store dictionary in pickle format +#with open(outputfilepath / "T_car.pkl", "wb") as f: +# pickle.dump(T_car, f) +# -> Commented out because it takes long to run. The result has been stored in pickle format + +# Load pre-generated dictionary +with open(outputfilepath / "T_car.pkl", "rb") as f: + T_car = pickle.load(f) + +#edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) + +# Browse the number of eligible neighbours depending on allowable travel time +results = [] +for mins in [30, 60, 90, 120]: + edges_flat = build_edges_within_radius_flat(T_car, max_minutes=mins) + neighbors_count = pd.Series({fac: len(neigh) for fac, neigh in edges_flat.items()}) + mean = neighbors_count.mean() + sem = neighbors_count.sem() # standard error of mean + ci95 = 1.96 * sem + results.append({"radius": mins, "mean": mean, "ci95": ci95}) + +results_df = pd.DataFrame(results) + +# Plot +plt.figure(figsize=(6,4)) +plt.bar(results_df["radius"], results_df["mean"], yerr=results_df["ci95"], capsize=5, color="skyblue") +plt.xlabel("Travel time radius (minutes)") +plt.ylabel("Average number of facilities within radius") +plt.title("Average connectivity of facilities with 95% CI") +plt.xticks(results_df["radius"]) +plt.savefig(outputfilepath / "neighbour_count_by_max_travel_time") + +# A manual check shows that for distances greater than 60 minutes ORS underestimates the travel time a little +# TODO consider using google maps API + +# 2) Run Scenario 1 (1-hour radius) + +#Drop NAs +# TODO find a more generalisable solution for this issue (within the optimisation functions) +lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] + +tlo_30 = run_redistribution_scenarios( + lmis, + scenario="radius_30", + time_matrix=T_car, + tau_keep=2.0, tau_tar=1.0, alpha=1, + K_in=2, K_out=2, Qmin_days=7.0, +) + +tlo_60 = run_redistribution_scenarios( + lmis, + scenario="radius_60", + time_matrix=T_car, + tau_keep=2.0, tau_tar=1.0, alpha=1, + K_in=2, K_out=2, Qmin_days=7.0, +) + +tlo_pooling = run_redistribution_scenarios( + lmis, + scenario="pooling", + tau_min=0.75, tau_max=3.0 +) + +tlo_availability_df = 
pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv")
+# Drop any scenario data previously included in the resourcefile
+tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']]
+
+# Import item_category
+program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]
+program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()]
+
+# Attach district, facility level and item_category to this dataset
+#----------------------------------------------------------------
+# Get the TLO Facility_ID for each district and facility level
+mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District'])  # reference set; not used below
+fac_levels = {'0', '1a', '1b', '2', '3', '4'}  # reference set; not used below
+tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                    on = ['Facility_ID'], how='left')
+
+tlo_availability_df = tlo_availability_df.merge(program_item_mapping,
+                    on = ['item_code'], how='left')
+tlo_availability_df = tlo_availability_df[~tlo_availability_df[['District', 'Facility_Level', 'month', 'item_code']].duplicated()]
+
+
+comparison_df = tlo_30.merge(tlo_availability_df, left_on = ['district', 'Facility_Level', 'month', 'item_code'],
+                    right_on = ['District', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1")
+comparison_df = comparison_df.merge(tlo_60, on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1")
+print(comparison_df['new_available_prop_x'].mean(),comparison_df['new_available_prop_y'].mean(),
+comparison_df['available_prop'].mean())
+
+comparison_df['new_available_prop_x'] = np.maximum(
+    comparison_df['new_available_prop_x'],
+    comparison_df['available_prop']
+)
+comparison_df['new_available_prop_y'] = np.maximum(
+    comparison_df['new_available_prop_y'],
+    comparison_df['available_prop']
+)
+
+print(comparison_df['new_available_prop_x'].mean(),comparison_df['new_available_prop_y'].mean(),
+comparison_df['available_prop'].mean())
+
+# TODO keep only government facilities?
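+# A minimal sketch of that government-only filter (hypothetical; the exact
+# ownership labels depend on lmis_facility_ownership.csv):
+# lmis_govt = lmis[lmis['fac_owner'] == 'Government']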
+ From 797ba66763063ce320ddff8eeaab47f65ce63691 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 11 Sep 2025 10:01:01 +0700 Subject: [PATCH 04/16] make the radius redistribution more efficient --- ...ate_consumable_redistribution_scenarios.py | 356 ++++++++++++------ 1 file changed, 243 insertions(+), 113 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index cd3680556d..a7604868f3 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -9,7 +9,8 @@ from typing import Literal, Optional, Dict, Tuple import requests -from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous +from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous, PULP_CBC_CMD + # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") @@ -331,51 +332,93 @@ def redistribute_radius_lp( # iterate per (district, month, item) group_cols = list(id_cols) + EPS = 1e-9 + EPS_AMC = 1e-6 + for (d, m, i), g in df.groupby(group_cols): # facilities in this slice g = g.copy() - facs = g[facility_col].tolist() - # build per-fac arrays - AMC = g.set_index(facility_col)[amc_col].astype(float) - OB0 = g.set_index(facility_col)["OB"].astype(float) + # ---------- (1) Select the correct time matrix for this district / slice ---------- + if isinstance(time_matrix, dict): + T_d = time_matrix.get(d) + if T_d is None or T_d.empty: + continue + else: + T_d = time_matrix + + # facilities present in this slice AND in the matrix index/cols + slice_facs = g[facility_col].dropna().unique().tolist() + present = [f for f in slice_facs if (f in T_d.index and f in T_d.columns)] + if len(present) < 2: + continue + + # subset and sanitize travel-times (NaN -> inf) + # This is to make sure that the redistribution LP is run only for those pairs for which a travel time could be calculated + T_sub = T_d.loc[present, present].replace(np.nan, np.inf) + # ---------- (2) Clean inputs (AMC, OB) ---------- + # build per-fac arrays + AMC = (g.set_index(facility_col)[amc_col] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + AMC[AMC <= 0.0] = EPS_AMC # avoid zero AMC + + OB0 = (g.set_index(facility_col)["OB"] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + + # keep only present facilities + AMC = AMC.reindex(present).fillna(EPS_AMC) + OB0 = OB0.reindex(present).fillna(0.0) + + # ---------- (3) Surplus/deficit with finite guards ---------- # donor surplus cap and receiver deficit surplus_cap = alpha * np.maximum(0.0, OB0.values - tau_keep * AMC.values) - deficit = np.maximum(0.0, tau_tar * AMC.values - OB0.values) - donors_mask = surplus_cap > 1e-9 # boolean vector marking facilities that actually have positive surplus - recv_mask = deficit > 1e-9 # boolean vector marking facilities that actually have a real need (deficit > 0). 
+ deficit = np.maximum(0.0, tau_tar * AMC.values - OB0.values) + surplus_cap = np.where(np.isfinite(surplus_cap), surplus_cap, 0.0) + deficit = np.where(np.isfinite(deficit), deficit, 0.0) + donors_mask = surplus_cap > EPS + recv_mask = deficit > EPS if not donors_mask.any() or not recv_mask.any(): - # nothing to do for this slice continue donors = list(AMC.index[donors_mask]) receivers = list(AMC.index[recv_mask]) - # Feasible edges within radius - #feasible = build_edges_within_radius(time_matrix.loc[donors + receivers, donors + receivers], radius_minutes) - T_d = time_matrix[d] # time_matrix is dict {district -> DataFrame} - - # 2) limit to only facilities present in this slice - slice_facs = g[facility_col].unique().tolist() - T_d = T_d.loc[slice_facs, slice_facs] + smax_map = {fac: float(x) for fac, x in zip(AMC.index, surplus_cap)} # creates a dict with key = facility and value = donatable surplus + # this is the upper bound for how much a facility can donate + deficit_map = {fac: float(x) for fac, x in zip(AMC.index, deficit)} # creates a dict with key = facility and value = deficit + # this is the upper bound for how much a facility can receive - # 3) build feasible edges for this radius + # ---------- (4) Build feasible edges within radius (finite & <= radius) ---------- feasible = {} - for g_fac in T_d.index: - feas_mask = (T_d.loc[g_fac].to_numpy() <= radius_minutes) & np.isfinite(T_d.loc[g_fac].to_numpy()) - feasible[g_fac] = set(T_d.columns[feas_mask]) - {g_fac} - - # Minimum lot (units) for each receiver depends on its AMC - AMC_map = AMC.to_dict() - Qmin_units = {f: Qmin_days * max(1e-9, AMC_map[f]) for f in receivers} + for g_fac in present: + row = T_sub.loc[g_fac].to_numpy() + mask = (row <= radius_minutes) & np.isfinite(row) + feas = set(T_sub.columns[mask]) - {g_fac} + feasible[g_fac] = feas + + # ---------- (5) Edge set with tight M_edge and min-lot per receiver ---------- + Qmin_units = {f: float(max(Qmin_days * AMC[f], 0.0)) for f in present} + M_edge = {} + for g_fac in donors: + smax = smax_map.get(g_fac, 0.0) + if smax <= EPS: + continue + for f_fac in feasible.get(g_fac, set()): + if f_fac not in receivers: + continue + h = deficit_map.get(f_fac, 0.0) + M = min(h, smax) + if np.isfinite(M) and M > EPS: + M_edge[(g_fac, f_fac)] = M - # Big-M per edge: cannot exceed receiver deficit - deficit_map = dict(zip(AMC.index, deficit)) # creates a dict with key = facility and value = deficit - # this is the upper bound for how much a facility can receive - smax_map = dict(zip(AMC.index, surplus_cap)) # creates a dict with key = facility and value = donatable surplus - # this is the upper bound for how much a facility can donate + if not M_edge: + continue # nothing feasible to move # Build MILP prob = LpProblem(f"Redistribution_{d}_{m}_{i}", LpMaximize) @@ -383,134 +426,215 @@ def redistribute_radius_lp( # Variables t = {} # transfer amounts per (g,f): continuous >=0 y = {} # edge-activation binaries + for (g_fac, f_fac), M in M_edge.items(): + t[(g_fac, f_fac)] = LpVariable(f"t_{g_fac}->{f_fac}", lowBound=0, upBound=M, cat=LpContinuous) + y[(g_fac, f_fac)] = LpVariable(f"y_{g_fac}->{f_fac}", lowBound=0, upBound=1, cat=LpBinary) - for g_fac in donors: - for f_fac in feasible.get(g_fac, set()): - if f_fac in receivers: - t[(g_fac, f_fac)] = LpVariable(f"t_{g_fac}->{f_fac}", lowBound=0, cat=LpContinuous) - y[(g_fac, f_fac)] = LpVariable(f"y_{g_fac}->{f_fac}", lowBound=0, upBound=1, cat=LpBinary) - - # Objective: maximize total shipped (sum of t) + # objective: maximize total 
shipped (proxy for availability gain) prob += lpSum(t.values()) - # Donor outflow caps + # donor outflow caps for g_fac in donors: - prob += lpSum(t[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in t) <= smax_map[g_fac] + prob += lpSum(t[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in t) <= smax_map.get(g_fac, 0.0) - # Receiver inflow caps + # receiver inflow caps for f_fac in receivers: - prob += lpSum(t[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in t) <= deficit_map[f_fac] + prob += lpSum(t[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in t) <= deficit_map.get(f_fac, 0.0) - # Edge linking + minimum lot size + K_in/K_out - for (g_fac, f_fac), var in t.items(): - M = deficit_map[f_fac] - prob += var <= M * y[(g_fac, f_fac)] - # minimum lot only makes sense if the edge is used at all; also don't force Qmin to exceed deficit - prob += var >= min(Qmin_units[f_fac], deficit_map[f_fac]) * y[(g_fac, f_fac)] + # link t and y; enforce min-lot clipped by edge capacity + for (g_fac, f_fac), M in M_edge.items(): + prob += t[(g_fac, f_fac)] <= M * y[(g_fac, f_fac)] + qmin = min(Qmin_units.get(f_fac, 0.0), M) + # if qmin is basically zero, don't force a min lot + if qmin > EPS: + prob += t[(g_fac, f_fac)] >= qmin * y[(g_fac, f_fac)] - # Max donors per receiver + # cardinality limits for f_fac in receivers: prob += lpSum(y[(g_fac, f_fac)] for g_fac in donors if (g_fac, f_fac) in y) <= K_in - - # Max receivers per donor for g_fac in donors: prob += lpSum(y[(g_fac, f_fac)] for f_fac in receivers if (g_fac, f_fac) in y) <= K_out - # Solve (default solver) - prob.solve() + # solve + prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) - # Apply transfers to OB' - if LpStatus[prob.status] not in ("Optimal","Optimal Infeasible","Not Solved"): - # If not solved (rare), skip changes + if LpStatus[prob.status] not in ("Optimal", "Optimal Infeasible", "Not Solved"): continue - # net change per facility - delta = {fac: 0.0 for fac in facs} + # ---------- (7) Apply transfers to OB' ---------- + delta = {fac: 0.0 for fac in present} for (g_fac, f_fac), var in t.items(): - moved = value(var) or 0.0 - delta[g_fac] -= moved - delta[f_fac] += moved + moved = float(value(var) or 0.0) + if moved: + delta[g_fac] -= moved + delta[f_fac] += moved - # write back OB' sel = (df["district"].eq(d) & df["month"].eq(m) & df["item_code"].eq(i)) df.loc[sel, "OB_prime"] = df.loc[sel].apply( - lambda r: r["OB"] + delta.get(r[facility_col], 0.0), axis=1 + lambda r: (r["OB"] + delta.get(r[facility_col], 0.0)), axis=1 ) - # Mechanistic availability after redistribution - df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(1e-9, df[amc_col]))) + print(d, m, i, + "donors:", len(donors), + "receivers:", len(receivers), + "edges:", len(M_edge)) + + # ---------- (8) Mechanistic availability after redistribution ---------- + df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(EPS_AMC, + df[amc_col].astype( + float).values))) return df def redistribute_pooling_lp( df: pd.DataFrame, - tau_min: float = 0.25, # optional lower floor (this is the same as 7 days Q_min) - tau_max: float = 3.0, # storage/USAID style max + tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days) + tau_max: float = 3.0, # upper ceiling (storage/policy max) id_cols=("district","month","item_code"), facility_col="fac_name", amc_col="amc", close_cols=("closing_bal","received","dispensed"), + keep_baseline_for_amc0: bool = True, # 
leave baseline availability where AMC≈0 + amc_eps: float = 1e-6 # threshold to treat AMC as "zero" ) -> pd.DataFrame: """ - Scenario 3: district-level pooling/push. Linear program maximizing total presumed availability. - - We linearize p = min(1, x/AMC) by introducing p_f in [0,1] with constraint AMC_f * p_f <= x_f. - - Returns df with 'available_prop_redis' updated for the pooling scenario. + Scenario 3: district-level pooling/push (robust). + Maximizes total availability with: + - NaN/inf guards on AMC/OB + - duplicate facility IDs collapsed within group + - floors scaled if total stock < sum floors + - optional 'excess' sink if total stock > sum ceilings + - availability computed safely; AMC≈0 rows keep baseline (optional) """ closing_bal, received, dispensed = close_cols - df = df.copy() - df["OB"] = compute_opening_balance(df) + out = df.copy() - df["OB_prime"] = df["OB"] + # Safe opening balance + out["OB"] = ( + out[closing_bal].astype(float) + - out[received].astype(float) + + out[dispensed].astype(float) + ).replace([np.inf, -np.inf], np.nan).fillna(0.0) + + # Default (will overwrite per group) + out["OB_prime"] = out["OB"] group_cols = list(id_cols) - for (d, m, i), g in df.groupby(group_cols): + + for (d, m, i), g in out.groupby(group_cols, sort=False): g = g.copy() - facs = g[facility_col].tolist() - AMC = g.set_index(facility_col)[amc_col].astype(float) - OB0 = g.set_index(facility_col)["OB"].astype(float) + # Clean AMC/OB and collapse duplicate facilities (unique index is required) + AMC = (g.set_index(facility_col)[amc_col] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + OB0 = (g.set_index(facility_col)["OB"] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)) + + if AMC.index.duplicated().any(): + AMC = AMC.groupby(level=0).first() + if OB0.index.duplicated().any(): + OB0 = OB0.groupby(level=0).sum() + + facs_all = AMC.index.tolist() total_stock = float(OB0.sum()) if total_stock <= 1e-9: - # nothing to allocate continue - # LP + # Positive-demand decision set + mask_pos = AMC >= amc_eps + facs_pos = AMC.index[mask_pos].tolist() + if len(facs_pos) == 0: + # No demand nodes to allocate to; leave OB as-is (or zero them if you prefer) + continue + + AMC_pos = AMC.loc[facs_pos] + # Floors/ceilings + lb = (tau_min * AMC_pos).astype(float) + ub = (tau_max * AMC_pos).astype(float) + + sum_lb = float(lb.sum()) + sum_ub = float(ub.sum()) + + # Feasibility: scale floors if not enough stock + if total_stock + 1e-9 < sum_lb: + scale = max(1e-9, total_stock / max(1e-9, sum_lb)) + lb = lb * scale + sum_lb = float(lb.sum()) + + # If stock exceeds ceilings, allow an excess sink to absorb leftovers + allow_excess_sink = total_stock > sum_ub + 1e-9 + + # ---------- LP ---------- prob = LpProblem(f"Pooling_{d}_{m}_{i}", LpMaximize) - # Decision vars - x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs} # allocated opening stock - p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs} # availability + # Decision variables + x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} # allocation + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos} # availability + + excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None - # Objective: maximize sum p_f (optionally weight by item importance) + # Objective: maximize total availability prob += lpSum(p.values()) # Conservation - prob += lpSum(x.values()) == total_stock + if excess is None: + prob += lpSum(x.values()) == total_stock + else: + prob += 
lpSum(x.values()) + excess == total_stock - # Guardrails (optional) - for f in facs: - prob += x[f] >= tau_min * AMC[f] - prob += x[f] <= tau_max * AMC[f] + # Floors/ceilings (use scalar values to avoid float(Series) errors) + for f in facs_pos: + prob += x[f] >= float(lb.loc[f]) + prob += x[f] <= float(ub.loc[f]) - # Linearization: AMC_f * p_f <= x_f - for f in facs: - prob += AMC[f] * p[f] <= x[f] + # Linearization: AMC_f * p_f <= x_f (only for AMC>=amc_eps) + for f in facs_pos: + prob += float(AMC_pos.loc[f]) * p[f] <= x[f] + # Solve prob.solve() - - if LpStatus[prob.status] not in ("Optimal","Optimal Infeasible","Not Solved"): + if LpStatus[prob.status] != "Optimal": continue - # Write back OB' and p' - x_sol = {f: value(var) or 0.0 for f, var in x.items()} - # OB' equals allocated x - sel = (df["district"].eq(d) & df["month"].eq(m) & df["item_code"].eq(i)) - df.loc[sel, "OB_prime"] = df.loc[sel].apply(lambda r: x_sol.get(r[facility_col], r["OB"]), axis=1) + # Apply solution + x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} + + sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i)) + # Map allocation to rows with positive demand + mask_rows_pos = sel & out[facility_col].isin(facs_pos) + out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values + + # Rows with AMC= amc_eps + + p_mech = np.zeros(len(out), dtype=float) + # Mechanistic availability for positive-demand rows + denom = np.maximum(amc_eps, amc_safe[pos_mask].to_numpy()) + p_mech[pos_mask.values] = np.minimum(1.0, np.maximum( + 0.0, out.loc[pos_mask, "OB_prime"].to_numpy() / denom + )) + + if keep_baseline_for_amc0: + # Keep baseline availability where AMC≈0 + base = (out.loc[~pos_mask, "available_prop"] + .astype(float) + .replace([np.inf, -np.inf], np.nan) + .fillna(0.0)).to_numpy() + p_mech[~pos_mask.values] = base + else: + # Or: 1 if OB'>0 else 0 for AMC≈0 rows + p_mech[~pos_mask.values] = (out.loc[~pos_mask, "OB_prime"].to_numpy() > 0.0).astype(float) - # Mechanistic availability p' = min(1, OB'/AMC) - df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(1e-9, df[amc_col]))) - return df + out["available_prop_redis"] = p_mech + return out # ------------------------ # 4) HIGH-LEVEL DRIVER @@ -637,7 +761,9 @@ def run_redistribution_scenarios( #Drop NAs # TODO find a more generalisable solution for this issue (within the optimisation functions) -lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] +#lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] + +#lmis = lmis[lmis.district == 'Lilongwe'] tlo_30 = run_redistribution_scenarios( lmis, @@ -683,23 +809,27 @@ def run_redistribution_scenarios( tlo_availability_df = tlo_availability_df[~tlo_availability_df[['District', 'Facility_Level', 'month', 'item_code']].duplicated()] -comparison_df = tlo_30.merge(tlo_availability_df, left_on = ['district', 'Facility_Level', 'month', 'item_code'], +comparison_df = tlo_30.rename(columns ={'new_available_prop': 'available_prop_30mins'}).merge(tlo_availability_df, left_on = ['district', 'Facility_Level', 'month', 'item_code'], right_on = ['District', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") -comparison_df = comparison_df.merge(tlo_60, on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") -print(comparison_df['new_available_prop_x'].mean(),comparison_df['new_available_prop_y'].mean(), -comparison_df['available_prop'].mean()) +comparison_df = 
comparison_df.merge(tlo_60.rename(columns ={'new_available_prop': 'available_prop_60mins'}), on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") +comparison_df = comparison_df.merge(tlo_pooling.rename(columns ={'new_available_prop': 'available_prop_pooling'}), on = ['district', 'Facility_Level', 'month', 'item_code'], how = 'left', validate = "1:1") +print(comparison_df['available_prop'].mean(),comparison_df['available_prop_30mins'].mean(), +comparison_df['available_prop_60mins'].mean(), comparison_df['available_prop_pooling'].mean()) -comparison_df['new_available_prop_x'] = np.maximum( - comparison_df['new_available_prop_x'], +comparison_df['available_prop_30mins'] = np.maximum( + comparison_df['available_prop_30mins'], comparison_df['available_prop'] ) -comparison_df['new_available_prop_y'] = np.maximum( - comparison_df['new_available_prop_y'], +comparison_df['available_prop_60mins'] = np.maximum( + comparison_df['available_prop_60mins'], comparison_df['available_prop'] ) - -print(comparison_df['new_available_prop_x'].mean(),comparison_df['new_available_prop_y'].mean(), -comparison_df['available_prop'].mean()) +comparison_df['available_prop_pooling'] = np.maximum( + comparison_df['available_prop_pooling'], + comparison_df['available_prop'] +) +print(comparison_df['available_prop'].mean(),comparison_df['available_prop_30mins'].mean(), +comparison_df['available_prop_60mins'].mean(), comparison_df['available_prop_pooling'].mean()) # TODO keep only government facilities? From 50720616c7abc3e91e42e0d914bb94a69f4c42de Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 24 Sep 2025 18:33:46 +0100 Subject: [PATCH 05/16] Updated to MILP functions - - get detailed logs of transfers --- ...ate_consumable_redistribution_scenarios.py | 109 +++++++++++++----- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index a7604868f3..0f58c4d1c2 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -315,6 +315,7 @@ def redistribute_radius_lp( facility_col="fac_name", amc_col="amc", close_cols=("closing_bal","received","dispensed"), + return_edge_log: bool = True, ) -> pd.DataFrame: """ Scenario 1 or 2: local swaps within a time radius. @@ -324,18 +325,18 @@ def redistribute_radius_lp( Returns a copy of df with a new column 'available_prop_redis' for redistributed probability per row. 
""" closing_bal, received, dispensed = close_cols - df = df.copy() - df["OB"] = compute_opening_balance(df) - - # container for updated OB by (district, month, item, facility) - df["OB_prime"] = df["OB"] + out = df.copy() + out["OB"] = compute_opening_balance(out).replace([np.inf, -np.inf], np.nan).fillna(0.0) + out["OB_prime"] = out["OB"] # container for updated OB by (district, month, item, facility) # iterate per (district, month, item) group_cols = list(id_cols) EPS = 1e-9 EPS_AMC = 1e-6 - for (d, m, i), g in df.groupby(group_cols): + edge_logs = [] # will log dicts with district, month, item_code, donor, receiver, units_moved, travel_minutes + + for (d, m, i), g in out.groupby(group_cols, sort=False): # facilities in this slice g = g.copy() @@ -461,16 +462,25 @@ def redistribute_radius_lp( if LpStatus[prob.status] not in ("Optimal", "Optimal Infeasible", "Not Solved"): continue - # ---------- (7) Apply transfers to OB' ---------- + # apply transfers & LOG delta = {fac: 0.0 for fac in present} for (g_fac, f_fac), var in t.items(): moved = float(value(var) or 0.0) - if moved: + if moved > EPS_AMC: delta[g_fac] -= moved delta[f_fac] += moved - - sel = (df["district"].eq(d) & df["month"].eq(m) & df["item_code"].eq(i)) - df.loc[sel, "OB_prime"] = df.loc[sel].apply( + if return_edge_log: + edge_logs.append({ + "district": d, "month": m, "item_code": i, + "donor_fac": g_fac, "receiver_fac": f_fac, + "units_moved": moved, + "travel_minutes": float(T_sub.loc[g_fac, f_fac]) if np.isfinite( + T_sub.loc[g_fac, f_fac]) else np.nan + }) + + # write OB' + sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i)) + out.loc[sel, "OB_prime"] = out.loc[sel].apply( lambda r: (r["OB"] + delta.get(r[facility_col], 0.0)), axis=1 ) @@ -480,30 +490,44 @@ def redistribute_radius_lp( "edges:", len(M_edge)) # ---------- (8) Mechanistic availability after redistribution ---------- - df["available_prop_redis"] = np.minimum(1.0, np.maximum(0.0, df["OB_prime"] / np.maximum(EPS_AMC, - df[amc_col].astype( - float).values))) - return df + out["available_prop_redis"] = np.minimum( + 1.0, + np.maximum(0.0, out["OB_prime"] / np.maximum(EPS_AMC, out[amc_col].astype(float).values)) + ) + + # build log DataFrames + edge_log_df = pd.DataFrame(edge_logs, + columns=["district", "month", "item_code", "donor_fac", "receiver_fac", "units_moved", + "travel_minutes"]) if return_edge_log else None + + return out, edge_log_df def redistribute_pooling_lp( df: pd.DataFrame, - tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days) + tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days) - minimum transfer required tau_max: float = 3.0, # upper ceiling (storage/policy max) id_cols=("district","month","item_code"), facility_col="fac_name", amc_col="amc", close_cols=("closing_bal","received","dispensed"), keep_baseline_for_amc0: bool = True, # leave baseline availability where AMC≈0 - amc_eps: float = 1e-6 # threshold to treat AMC as "zero" + amc_eps: float = 1e-6, # threshold to treat AMC as "zero" + return_move_log: bool = True # return a detailed df showing net movement of consumables after redistribution ) -> pd.DataFrame: """ - Scenario 3: district-level pooling/push (robust). + Scenario 3: district-level pooling/push . 
Maximizes total availability with:
      - NaN/inf guards on AMC/OB
      - duplicate facility IDs collapsed within group
      - floors scaled if total stock < sum floors
      - optional 'excess' sink if total stock > sum ceilings
      - availability computed safely; AMC≈0 rows keep baseline (optional)
+
+    Returns:
+      - out: original df plus columns:
+          OB, OB_prime, available_prop_redis, received_from_pool
+        where received_from_pool = OB_prime - OB (positive = received, negative = donated)
+      - (optional) move_log: per (district, month, item, facility) net movement summary
     """
     closing_bal, received, dispensed = close_cols
     out = df.copy()
@@ -519,6 +543,7 @@ def redistribute_pooling_lp(
     out["OB_prime"] = out["OB"]
 
     group_cols = list(id_cols)
+    move_rows = []  # optional movement log
 
     for (d, m, i), g in out.groupby(group_cols, sort=False):
         g = g.copy()
@@ -571,13 +596,13 @@ def redistribute_pooling_lp(
         prob = LpProblem(f"Pooling_{d}_{m}_{i}", LpMaximize)
 
         # Decision variables
-        x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos}   # allocation
-        p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos}  # availability
+        x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos}   # how much stock to allocate to facility f
+        p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos}  # availability probability proxy for facility f
 
         excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None
 
         # Objective: maximize total availability
-        prob += lpSum(p.values())
+        prob += lpSum(p.values())  # prioritize broad coverage: get as many facilities as possible to have stock
 
         # Conservation
         if excess is None:
@@ -590,7 +615,7 @@ def redistribute_pooling_lp(
             prob += x[f] >= float(lb.loc[f])
             prob += x[f] <= float(ub.loc[f])
 
-        # Linearization: AMC_f * p_f <= x_f (only for AMC>=amc_eps)
+        # Linearization: AMC_f * p_f <= x_f (only for AMC>=amc_eps), i.e. p <= x/AMC
         for f in facs_pos:
             prob += float(AMC_pos.loc[f]) * p[f] <= x[f]
 
@@ -602,20 +627,36 @@ def redistribute_pooling_lp(
         # Apply solution
         x_sol = {f: float(value(var) or 0.0) for f, var in x.items()}
 
-        sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i))
+        sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i))  # selection mask:
+        # boolean filter that picks out the subset of rows in out corresponding to the current group (district, month, item)
         # Map allocation to rows with positive demand
-        mask_rows_pos = sel & out[facility_col].isin(facs_pos)
-        out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values
+        mask_rows_pos = sel & out[facility_col].isin(facs_pos)  # only those facilities with a positive demand/AMC
+        out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values  # assign new opening balance
 
         # Rows with AMC < amc_eps: set OB' = 0 (their stock is donated to the pool)
         mask_rows_zero = sel & ~out[facility_col].isin(facs_pos)
         out.loc[mask_rows_zero, "OB_prime"] = 0.0
 
     # ---------- Availability after redistribution (safe) ----------
     amc_safe = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)
     pos_mask = amc_safe >= amc_eps
 
-    p_mech = np.zeros(len(out), dtype=float)
+    p_mech = np.zeros(len(out), dtype=float)  # output array for the new availability (available_prop_redis), initialized to 0 for all rows
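+    # Hedged worked example of the mechanistic formula applied below (synthetic numbers):
+    # p' = min(1, max(0, OB'/AMC)); OB' = 6 with AMC = 4 gives 1.0 (capped),
+    # OB' = 2 with AMC = 4 gives 0.5, and a negative OB' is clipped to 0.0.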
# Mechanistic availability for positive-demand rows denom = np.maximum(amc_eps, amc_safe[pos_mask].to_numpy()) p_mech[pos_mask.values] = np.minimum(1.0, np.maximum( @@ -634,8 +675,21 @@ def redistribute_pooling_lp( p_mech[~pos_mask.values] = (out.loc[~pos_mask, "OB_prime"].to_numpy() > 0.0).astype(float) out["available_prop_redis"] = p_mech + # NEW: per-row received_from_pool (using each row's OB) + out["received_from_pool"] = out["OB_prime"] - out["OB"] + + if return_move_log: + move_log = pd.DataFrame( + move_rows, + columns=["district", "month", "item_code", "facility", "received_from_pool", "x_allocated", "OB0_agg"] + ) + return out, move_log + return out +# pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True) + + # ------------------------ # 4) HIGH-LEVEL DRIVER # ------------------------ @@ -696,6 +750,7 @@ def run_redistribution_scenarios( facility_col=facility_col, amc_col=amc_col, close_cols=close_cols, + return_move_log=False, ) else: raise ValueError("scenario must be one of 'radius_30', 'radius_60', 'pooling'.") From ed042146f010bae93392989277b859dc1affba68 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 25 Sep 2025 15:05:56 +0100 Subject: [PATCH 06/16] Allow for resource pooling at a lower level than district --- ...ate_consumable_redistribution_scenarios.py | 284 +++++++++++++----- 1 file changed, 217 insertions(+), 67 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 0f58c4d1c2..9b0feeafa5 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -10,7 +10,7 @@ import requests from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous, PULP_CBC_CMD - +from math import ceil # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") @@ -295,6 +295,127 @@ def build_edges_within_radius_flat(T_by_dist: dict, max_minutes: float) -> dict[ # a = build_edges_within_radius(T_car, max_minutes = 18) +# Defining clusters of health facilities within district +# This function helps find the facilities which would be appropriate cluster centers +def _farthest_first_seeds(T: pd.DataFrame, k: int, big: float = 1e9) -> list: + """ + Pick k seed medoids via farthest-first traversal on a travel-time matrix. + Treat inf/NaN distances as 'big' so disconnected components get separate seeds. 
+ """ + n = T.shape[0] + facs = T.index.tolist() + D = T.to_numpy().astype(float) + D[~np.isfinite(D)] = big + + # Start at the row with largest average distance (covers sparse areas first) + start = int(np.nanargmax(np.nanmean(D, axis=1))) # the remotest facility + seeds_idx = [start] + + # Iteratively add the point with max distance to its nearest seed + for _ in range(1, k): + # min distance to any existing seed for every point + min_to_seed = np.min(D[:, seeds_idx], axis=1) # this has a dimension of [facs, number of seeds] + next_idx = int(np.argmax(min_to_seed)) # for each facility find the distance to its nearest seed + # and the facility farthest from the nearest seed becomes the next seed + if next_idx in seeds_idx: + # Fallback: pick any non-seed with highest min_to_seed + candidates = [i for i in range(n) if i not in seeds_idx] + if not candidates: + break + next_idx = int(candidates[np.argmax(min_to_seed[candidates])]) + seeds_idx.append(next_idx) + + return [facs[i] for i in seeds_idx] # list of length k representing the clustering points + +# Assign each facility to its nearest seed subject to a hard cluster capacity (≤ X members) +def _assign_to_cluster_with_fixed_capacity(T: pd.DataFrame, seeds: list, capacity: int, big: float = 1e9) -> Dict[str, int]: + """ + Greedy assignment of facilities to nearest seed that still has capacity. + Returns: mapping facility -> seed_index (position in seeds list). + """ + facs = T.index.tolist() + D = T.loc[facs, seeds].to_numpy().astype(float) # Distance of all facilities from the k seeds + D[~np.isfinite(D)] = big + + # each facility: nearest distance to any seed (for stable ordering) + nearest = D.min(axis=1) # find the shortest distance to a cluster for each facility + order = np.argsort(nearest) # sort all facilities in ascending order of their distance from the nearest facility + + cap_left = {j: capacity for j in range(len(seeds))} # the capacity left for each seed + assign = {} + + for idx in order: + f = facs[idx] + # try seeds in ascending distance + seq = np.argsort(D[idx, :]) # the sequence of seeds most suitable for idx + placed = False + for j in seq: + if cap_left[j] > 0: + assign[f] = j + cap_left[j] -= 1 + placed = True + break + if not placed: + # total capacity >= n, so this should be rare; if it happens, put in least-loaded seed + j = min(cap_left, key=lambda jj: cap_left[jj]) + assign[f] = j + cap_left[j] -= 1 + + return assign + +def capacity_clusters_for_district( + T_d: pd.DataFrame, cluster_size: int = 5, big: float = 1e9, refine_swaps: int = 0 +) -> Dict[str, str]: + """ + Build ~equal-size clusters (size<=cluster_size) from a district's travel-time matrix via + capacity-constrained k-medoids (farthest-first seeds + greedy capacity assignment). + + Returns: {facility_id -> cluster_id} (cluster ids like 'C00','C01',...) + """ + facs = T_d.index.tolist() + n = len(facs) + if n == 0: + return {} + if n <= cluster_size: + return {f: "C00" for f in facs} + + k = ceil(n / cluster_size) + seeds = _farthest_first_seeds(T_d, k=k, big=big) + assign_seed_idx = _assign_to_cluster_with_fixed_capacity(T_d, seeds=seeds, capacity=cluster_size, big=big) + + # Optional tiny refinement: (off by default) + # You could add 1–2 passes of medoid swap within clusters to reduce intra-cluster travel. 
+ + # Build cluster ids in seed order + seed_to_cid = {j: f"C{j:02d}" for j in range(len(seeds))} + return {f: seed_to_cid[assign_seed_idx[f]] for f in facs} + +def build_capacity_clusters_all( + T_by_dist: Dict[str, pd.DataFrame], cluster_size: int = 5 +) -> pd.Series: + """ + Apply capacity clustering to all districts. + Args: + T_by_dist: {'DistrictName': time_matrix (minutes, square DF with facility ids)} + cluster_size: desired max cluster size (e.g., 5) + + Returns: + pd.Series mapping facility_id -> cluster_id names scoped by district, e.g. 'Nkhotakota#C03' + """ + mappings = [] + for d, T_d in T_by_dist.items(): + if T_d is None or T_d.empty: + continue + local_map = capacity_clusters_for_district(T_d, cluster_size=cluster_size) + if not local_map: + continue + s = pd.Series(local_map, name="cluster_id") + s = s.map(lambda cid: f"{d}#{cid}") # scope cluster name by district + mappings.append(s) + if not mappings: + return pd.Series(dtype=object) + return pd.concat(mappings) + # ----------------------------------------------- # 3) LP/MILP Redistribution # ----------------------------------------------- @@ -512,7 +633,9 @@ def redistribute_pooling_lp( close_cols=("closing_bal","received","dispensed"), keep_baseline_for_amc0: bool = True, # leave baseline availability where AMC≈0 amc_eps: float = 1e-6, # threshold to treat AMC as "zero" - return_move_log: bool = True # return a detailed df showing net movement of consumables after redistribution + return_move_log: bool = True, # return a detailed df showing net movement of consumables after redistribution + pooling_level: str = "district", # "district" or "cluster" + cluster_map: pd.Series | None = None # required if pooling_level=="cluster"; this specifes which cluster each facility belongs to ) -> pd.DataFrame: """ Scenario 3: district-level pooling/push . @@ -523,12 +646,21 @@ def redistribute_pooling_lp( - optional 'excess' sink if total stock > sum ceilings - availability computed safely; AMC≈0 rows keep baseline (optional) + Pooling redistribution that can operate at the district level (default) + or within fixed-size clusters inside districts. + + If pooling_level == "cluster", you must pass cluster_map: Series mapping facility_id -> cluster_id + (cluster ids should already be district-scoped, e.g., "Dedza#C01"). 
+ Returns: - out: original df plus columns: OB, OB_prime, available_prop_redis, received_from_pool where received_from_pool = OB_prime - OB (pos=received, neg=donated) - (optional) move_log: per (district, month, item, facility) net movement summary """ + if pooling_level not in ("district", "cluster"): + raise ValueError("pooling_level must be 'district' or 'cluster'.") + closing_bal, received, dispensed = close_cols out = df.copy() @@ -542,147 +674,140 @@ def redistribute_pooling_lp( # Default (will overwrite per group) out["OB_prime"] = out["OB"] + # Attach cluster if needed + if pooling_level == "cluster": + if cluster_map is None: + raise ValueError("cluster_map is required when pooling_level='cluster'.") + # cluster_map: index = facility_id (facility_col), value = cluster_id (already district-scoped) + out = out.merge( + cluster_map.rename("cluster_id"), + how="left", + left_on=facility_col, + right_index=True, + ) + if out["cluster_id"].isna().any(): + # facilities missing a cluster—assign singleton clusters to keep them + out["cluster_id"] = out["cluster_id"].fillna( + out["district"].astype(str) + "#CXX_" + out[facility_col].astype(str)) + group_cols = list(id_cols) + node_label = "district" + if pooling_level == "cluster": + group_cols = ["cluster_id", "month", "item_code"] + node_label = "cluster_id" move_rows = [] # optional movement log - for (d, m, i), g in out.groupby(group_cols, sort=False): + for keys, g in out.groupby(group_cols, sort=False): g = g.copy() + node_val, m, i = keys if pooling_level == "cluster" else (g["district"].iloc[0], keys[1], keys[2]) - # Clean AMC/OB and collapse duplicate facilities (unique index is required) + # Clean AMC/OB and dedupe AMC = (g.set_index(facility_col)[amc_col] - .astype(float) - .replace([np.inf, -np.inf], np.nan) - .fillna(0.0)) + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) OB0 = (g.set_index(facility_col)["OB"] - .astype(float) - .replace([np.inf, -np.inf], np.nan) - .fillna(0.0)) - + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) if AMC.index.duplicated().any(): AMC = AMC.groupby(level=0).first() if OB0.index.duplicated().any(): OB0 = OB0.groupby(level=0).sum() - facs_all = AMC.index.tolist() total_stock = float(OB0.sum()) if total_stock <= 1e-9: continue - # Positive-demand decision set + # Participants mask_pos = AMC >= amc_eps facs_pos = AMC.index[mask_pos].tolist() - if len(facs_pos) == 0: - # No demand nodes to allocate to; leave OB as-is (or zero them if you prefer) + if not facs_pos: continue AMC_pos = AMC.loc[facs_pos] - # Floors/ceilings lb = (tau_min * AMC_pos).astype(float) ub = (tau_max * AMC_pos).astype(float) - - sum_lb = float(lb.sum()) + sum_lb = float(lb.sum()); sum_ub = float(ub.sum()) - # Feasibility: scale floors if not enough stock + # Feasibility fixes if total_stock + 1e-9 < sum_lb: scale = max(1e-9, total_stock / max(1e-9, sum_lb)) lb = lb * scale sum_lb = float(lb.sum()) - - # If stock exceeds ceilings, allow an excess sink to absorb leftovers allow_excess_sink = total_stock > sum_ub + 1e-9 # ---------- LP ---------- - prob = LpProblem(f"Pooling_{d}_{m}_{i}", LpMaximize) - - # Decision variables - x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} # how much stock to allocation to facility f - p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos} # Availability probability proxy for facility f + prob = LpProblem(f"Pooling_{node_val}_{m}_{i}", LpMaximize) + x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} + p = {f: LpVariable(f"p_{f}", 
lowBound=0, upBound=1) for f in facs_pos} excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None - # Objective: maximize total availability - prob += lpSum(p.values()) # prioritize broad coverage—get as many facilities as possible to have stock + prob += lpSum(p.values()) # maximize total availability - # Conservation if excess is None: prob += lpSum(x.values()) == total_stock else: prob += lpSum(x.values()) + excess == total_stock - # Floors/ceilings (use scalar values to avoid float(Series) errors) for f in facs_pos: prob += x[f] >= float(lb.loc[f]) prob += x[f] <= float(ub.loc[f]) - - # Linearization: AMC_f * p_f <= x_f (only for AMC>=amc_eps), i.e p = x/AMC - for f in facs_pos: prob += float(AMC_pos.loc[f]) * p[f] <= x[f] - # Solve prob.solve() if LpStatus[prob.status] != "Optimal": continue - # Apply solution + # Apply solution to OB' x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} - sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i)) # selection mask - # boolean filter that picks out the subset of rows in out corresponding to the current group (district, month, item) - # Map allocation to rows with positive demand - mask_rows_pos = sel & out[facility_col].isin(facs_pos) # only those facilities with a positive demand/AMC - out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values # assign new opening balance + # selection mask for this node-month-item + if pooling_level == "district": + sel = (out["district"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + else: + sel = (out["cluster_id"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + + mask_rows_pos = sel & out[facility_col].isin(facs_pos) + out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values - # Rows with AMC= amc_eps - p_mech = np.zeros(len(out), dtype=float) # Prepare an output array for the new availability (available_prop_redis), initialized to 0 for all rows. 
-    # Mechanistic availability for positive-demand rows
+    p_mech = np.zeros(len(out), dtype=float)
     denom = np.maximum(amc_eps, amc_safe[pos_mask].to_numpy())
     p_mech[pos_mask.values] = np.minimum(
         1.0, np.maximum(0.0, out.loc[pos_mask, "OB_prime"].to_numpy() / denom)
     )
 
     if keep_baseline_for_amc0:
         base = (out.loc[~pos_mask, "available_prop"]
                 .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)).to_numpy()
         p_mech[~pos_mask.values] = base
     else:
         p_mech[~pos_mask.values] = (out.loc[~pos_mask, "OB_prime"].to_numpy() > 0.0).astype(float)
 
     out["available_prop_redis"] = p_mech
     out["received_from_pool"] = out["OB_prime"] - out["OB"]
 
     if return_move_log:
         move_log = pd.DataFrame(move_rows)
         return out, move_log
 
     return out
 
 # pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True)
 
@@ -812,14 +937,40 @@ def run_redistribution_scenarios(
 # A manual check shows that for distances greater than 60 minutes ORS underestimates the travel time a little
 # TODO consider using google maps API
 
-# 2) Run Scenario 1 (1-hour radius)
-
 #Drop NAs
 # TODO find a more generalisable solution for this issue (within the optimisation functions)
 #lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())]
 
 #lmis = lmis[lmis.district == 'Lilongwe']
 
+# Trying the updated pool-based redistribution
+# 1) Build clusters from the per-district travel-time matrices (minutes)
+#    T_car: {"District A": DF(index=fac_ids, cols=fac_ids), ...}
+cluster_size = 3
+cluster_series = build_capacity_clusters_all(T_car, cluster_size=cluster_size)
+# cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ...
+
+# 2) Pool at the cluster level
+pooled_df, cluster_moves = redistribute_pooling_lp(
+    df=lmis,  # the LMIS dataframe
+    tau_min=0.25, tau_max=3.0,
+    pooling_level="cluster",
+    cluster_map=cluster_series,
+    return_move_log=True,
+)
+pooled_df.to_csv(outputfilepath / 'clustering_n3_df.csv', index=False)
+
+# 3) Pool at the full district level (upper bound)
+pooled_district_df, district_moves = redistribute_pooling_lp(
+    df=lmis,
+    tau_min=0.25, tau_max=3.0,
+    pooling_level="district",
+    return_move_log=True,
+)
+pooled_district_df.to_csv(outputfilepath / 'clustering_district_df.csv', index=False)
+
+# 4) Run Scenario 1 (30-minute radius)
+
 tlo_30 = run_redistribution_scenarios(
     lmis,
     scenario="radius_30",
@@ -887,4 +1038,3 @@ def run_redistribution_scenarios(
 comparison_df['available_prop_60mins'].mean(), comparison_df['available_prop_pooling'].mean())
 
 # TODO keep only government facilities?
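+# A self-contained toy of the pooling LP above (hedged sketch, synthetic numbers):
+# availability p_f = min(1, x_f / AMC_f) is linearised as AMC_f * p_f <= x_f with p_f in [0, 1],
+# so the solver fills low-AMC facilities first because they buy availability cheapest.
+toy = LpProblem("pooling_toy", LpMaximize)
+amc_toy = {"fac_a": 10.0, "fac_b": 5.0, "fac_c": 2.0}  # hypothetical AMCs
+x_toy = {f: LpVariable(f"x_{f}", lowBound=0) for f in amc_toy}
+p_toy = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in amc_toy}
+toy += lpSum(p_toy.values())                # objective: total availability
+toy += lpSum(x_toy.values()) == 12.0        # conservation of the pooled stock
+for f, a in amc_toy.items():
+    toy += a * p_toy[f] <= x_toy[f]         # linearised p <= x/AMC
+toy.solve(PULP_CBC_CMD(msg=False))
+# Expected optimum: p_c = p_b = 1 (x_c = 2, x_b = 5), remainder x_a = 5 so p_a = 0.5
+print({f: value(v) for f, v in p_toy.items()})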
- From 8b5542764ec98a3f7da11a27d40f51583aa9fc63 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Mon, 20 Oct 2025 16:46:28 +0100 Subject: [PATCH 07/16] plot stock adequacy by district and month --- ...ate_consumable_redistribution_scenarios.py | 140 ++++++++++++++++-- 1 file changed, 126 insertions(+), 14 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 9b0feeafa5..ef3910232e 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -68,8 +68,126 @@ #lmis = lmis[lmis["Facility_Level"] == '1a'] # ----------------------------------- -# 1) Estimate travel time matrix +# 1) Data exploration # ----------------------------------- +def compute_opening_balance(df: pd.DataFrame) -> pd.Series: + """ + Opening balance from same-month records: + OB = closing_bal - received + dispensed. + This is equivalent to OB_(m) = CB_(m-1) + """ + return df["closing_bal"] - df["received"] + df["dispensed"] + +def generate_stock_adequacy_heatmap( + df: pd.DataFrame, + figures_path: Path = Path("figures"), + filename: str = "heatmap_adequacy_opening_vs_3xamc.png", + include_missing_as_fail: bool = False, # if True, items with NaN opening/amc count as NOT adequate + as_pct: bool = True, + amc_threshold: float = 3.0, + decimals: int = 0, + cmap: str = "RdYlGn", + figsize=(14, 10), + xtick_rotation: int = 45, + ytick_rotation: int = 0, +): + """ + Heatmap values: for each (month, district), the % of item_code groups where + sum(opening_balance over Facility_ID) >= 3 * sum(amc over Facility_ID). 
+ """ + + df = df.copy() + df["opening_bal"] = compute_opening_balance(df).replace([np.inf, -np.inf], np.nan).fillna(0.0) + + # ---- 1) Make sure 'month' is sortable chronologically ---- + # ---- 2) Aggregate to (month, district, item_code) over facilities ---- + agg = ( + df.groupby(["month", "district", "item_code"], dropna=False) + .agg(opening_bal=("opening_bal", "sum"), + amc=("amc", "sum")) + .reset_index() + ) + + # ---- 3) Adequacy indicator per (month, district, item_code) ---- + if include_missing_as_fail: + # NaNs treated as fail -> fill with NaN-safe compare: set False when either missing + adequate = (agg["opening_bal"].fillna(np.nan) >= amc_threshold * agg["amc"].fillna(np.nan)) & \ + agg[["opening_bal", "amc"]].notna().all(axis=1) + else: + valid = agg.dropna(subset=["opening_bal", "amc"]) + adequate = pd.Series(False, index=agg.index) + adequate.loc[valid.index] = valid["opening_bal"] >= amc_threshold * valid["amc"] + + agg["adequate"] = adequate.astype(int) # 1/0 + + # ---- 4) % adequate per (month, district) across item_code ---- + # Denominator = number of item_code present after NaN handling + if include_missing_as_fail: + denom = agg.groupby(["month", "district"])["item_code"].nunique() + numer = agg.groupby(["month", "district"])["adequate"].sum() + else: + valid_mask = agg[["opening_bal", "amc"]].notna().all(axis=1) + denom = agg[valid_mask].groupby(["month", "district"])["item_code"].nunique() + numer = agg[valid_mask].groupby(["month", "district"])["adequate"].sum() + + pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan) + + # Bring to a DataFrame for pivoting + pct_df = pct.reset_index(name="pct_adequate") + + # ---- 5) Pivot: districts (rows) x months (columns) ---- + # Sort months by _month_sort and use _month_label as the displayed column name + month_order = ( + pct_df[["month"]] + .drop_duplicates() + .sort_values("month") + ["month"] + .tolist() + ) + heatmap_df = pct_df.pivot(index="district", columns="month", values="pct_adequate") + heatmap_df = heatmap_df.reindex(columns=month_order) + + # Optional rounding for nicer colorbar ticks (doesn't affect color) + if decimals is not None: + heatmap_df = heatmap_df.round(decimals) + + # ---- 6) Plot heatmap ---- + sns.set(font_scale=1.0) + fig, ax = plt.subplots(figsize=figsize) + + hm = sns.heatmap( + heatmap_df, + cmap=cmap, + cbar_kws={"label": f"% of consumables with Opening Balance ≥ {amc_threshold} × AMC"}, + ax=ax, + annot=True, annot_kws={"size": 12}, + vmin = 0, vmax = 100) + + ax.set_xlabel("Month") + ax.set_ylabel("District") + ax.set_xticklabels(ax.get_xticklabels(), rotation=xtick_rotation) + ax.set_yticklabels(ax.get_yticklabels(), rotation=ytick_rotation) + + # Keep colorbar ticks plain (no scientific notation) + try: + cbar_ax = ax.figure.axes[-1] + cbar_ax.ticklabel_format(style="plain") + except Exception: + pass + + # ---- 7) Save & return ---- + figures_path.mkdir(parents=True, exist_ok=True) + outpath = figures_path / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + plt.show() + plt.close(fig) + + return fig, ax, heatmap_df + +# ----------------------------------- +# 2) Estimate travel time matrix +# ----------------------------------- + def _chunk_indices(n: int, chunk: int): """Yield (start, end) index pairs for chunking 0..n-1.""" for s in range(0, n, chunk): @@ -242,17 +360,8 @@ def build_time_matrices_by_district( return matrices # ----------------------------------------------- -# 2) Data prep for redistribution linear program +# 3) Data prep for redistribution 
linear program # ----------------------------------------------- -def compute_opening_balance(df: pd.DataFrame) -> pd.Series: - """ - Opening balance from same-month records: - OB = closing_bal - received + dispensed. - This is equivalent to OB_(m) = CB_(m-1) - """ - return df["closing_bal"] - df["received"] + df["dispensed"] - - def presumed_availability(ob, amc, eps=1e-9) -> float: """ Presumed likelihood of cons availability = p = min(1, OB/AMC) at month start (no additional receipts considered @@ -330,7 +439,7 @@ def _farthest_first_seeds(T: pd.DataFrame, k: int, big: float = 1e9) -> list: # Assign each facility to its nearest seed subject to a hard cluster capacity (≤ X members) def _assign_to_cluster_with_fixed_capacity(T: pd.DataFrame, seeds: list, capacity: int, big: float = 1e9) -> Dict[str, int]: """ - Greedy assignment of facilities to nearest seed that still has capacity. + Greedy assignment of facilities to nearest seed that still has capacity (based on maximum cluster size). Returns: mapping facility -> seed_index (position in seeds list). """ facs = T.index.tolist() @@ -701,7 +810,6 @@ def redistribute_pooling_lp( g = g.copy() node_val, m, i = keys if pooling_level == "cluster" else (g["district"].iloc[0], keys[1], keys[2]) - # Clean AMC/OB and dedupe AMC = (g.set_index(facility_col)[amc_col] .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) OB0 = (g.set_index(facility_col)["OB"] @@ -803,7 +911,7 @@ def redistribute_pooling_lp( else: p_mech[~pos_mask.values] = (out.loc[~pos_mask, "OB_prime"].to_numpy() > 0.0).astype(float) - out["available_prop_redis"] = p_mech + out["available_prop_redis"] = np.maximum(p_mech,out["available_prop"]) out["received_from_pool"] = out["OB_prime"] - out["OB"] if return_move_log: @@ -902,6 +1010,10 @@ def run_redistribution_scenarios( # osrm_base_url="https://router.project-osrm.org", # max_chunk=50) +# Plot stock adequacy by district and month +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, amc_threshold = 3, filename = "mth_district_stock_adequacy_3amc.png") +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, amc_threshold = 2, filename = "mth_district_stock_adequacy_2amc.png") + # Store dictionary in pickle format #with open(outputfilepath / "T_car.pkl", "wb") as f: # pickle.dump(T_car, f) From 9f0673f444b7f82a9d041e2ffccbac8984349499 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Mon, 20 Oct 2025 17:39:09 +0100 Subject: [PATCH 08/16] add functionality to heatmap function (less than AMC or greater than AMC) + Add average row and column --- ...ate_consumable_redistribution_scenarios.py | 114 +++++++++++++----- 1 file changed, 86 insertions(+), 28 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index ef3910232e..8857725e05 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -1,6 +1,7 @@ import datetime from pathlib import Path import pickle +import calendar import matplotlib.pyplot as plt import numpy as np @@ -82,14 +83,18 @@ def generate_stock_adequacy_heatmap( 
df: pd.DataFrame, figures_path: Path = Path("figures"), filename: str = "heatmap_adequacy_opening_vs_3xamc.png", + y_var: str = "district", # the variable on the y-axis of the heatmap + value_var: str = "item_code", # the count variable on the basis of which the values are calculated + value_label: str = "", # label describing the values in the heatmap include_missing_as_fail: bool = False, # if True, items with NaN opening/amc count as NOT adequate - as_pct: bool = True, amc_threshold: float = 3.0, + compare: str = "ge" , # "ge" for >= threshold*AMC, "le" for <= threshold*AMC decimals: int = 0, cmap: str = "RdYlGn", - figsize=(14, 10), + figsize= None, xtick_rotation: int = 45, ytick_rotation: int = 0, + annotation: bool = True, ): """ Heatmap values: for each (month, district), the % of item_code groups where @@ -99,10 +104,16 @@ def generate_stock_adequacy_heatmap( df = df.copy() df["opening_bal"] = compute_opening_balance(df).replace([np.inf, -np.inf], np.nan).fillna(0.0) - # ---- 1) Make sure 'month' is sortable chronologically ---- + # --- 1. Ensure month is int and build label --- + df["month"] = pd.to_numeric(df["month"], errors="coerce").astype("Int64") + df = df.dropna(subset=["month"]) + df["month"] = df["month"].astype(int) + + df["_month_label"] = df["month"].map(lambda m: calendar.month_abbr[m]) + # ---- 2) Aggregate to (month, district, item_code) over facilities ---- agg = ( - df.groupby(["month", "district", "item_code"], dropna=False) + df.groupby(["month", "_month_label", y_var, value_var], dropna=False) .agg(opening_bal=("opening_bal", "sum"), amc=("amc", "sum")) .reset_index() @@ -111,60 +122,85 @@ def generate_stock_adequacy_heatmap( # ---- 3) Adequacy indicator per (month, district, item_code) ---- if include_missing_as_fail: # NaNs treated as fail -> fill with NaN-safe compare: set False when either missing - adequate = (agg["opening_bal"].fillna(np.nan) >= amc_threshold * agg["amc"].fillna(np.nan)) & \ - agg[["opening_bal", "amc"]].notna().all(axis=1) + ok = agg[["opening_bal", "amc"]].notna().all(axis=1) + left = agg["opening_bal"] + right = amc_threshold * agg["amc"] + if compare == "le": + cond = (left <= right) & ok + else: # default to ">=" + cond = (left >= right) & ok else: valid = agg.dropna(subset=["opening_bal", "amc"]) - adequate = pd.Series(False, index=agg.index) - adequate.loc[valid.index] = valid["opening_bal"] >= amc_threshold * valid["amc"] + cond = pd.Series(False, index=agg.index) + left = valid["opening_bal"] + right = amc_threshold * valid["amc"] + if compare == "le": + cond.loc[valid.index] = left <= right + else: + cond.loc[valid.index] = left >= right - agg["adequate"] = adequate.astype(int) # 1/0 + agg["condition_met"] = cond.astype(int) - # ---- 4) % adequate per (month, district) across item_code ---- - # Denominator = number of item_code present after NaN handling + # --- % meeting condition per (month, district) across item_code --- if include_missing_as_fail: - denom = agg.groupby(["month", "district"])["item_code"].nunique() - numer = agg.groupby(["month", "district"])["adequate"].sum() + denom = agg.groupby(["month", "_month_label", y_var])[value_var].nunique() + numer = agg.groupby(["month", "_month_label", y_var])["condition_met"].sum() else: valid_mask = agg[["opening_bal", "amc"]].notna().all(axis=1) - denom = agg[valid_mask].groupby(["month", "district"])["item_code"].nunique() - numer = agg[valid_mask].groupby(["month", "district"])["adequate"].sum() - - pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan) + denom = 
agg[valid_mask].groupby(["month", "_month_label", y_var])[value_var].nunique()
+        numer = agg[valid_mask].groupby(["month", "_month_label", y_var])["condition_met"].sum()
 
-    pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan)
-
-    # Bring to a DataFrame for pivoting
-    pct_df = pct.reset_index(name="pct_adequate")
+    pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan).reset_index(name="pct_meeting")
 
     # ---- 5) Pivot: districts (rows) x months (columns) ----
     # Sort months by _month_sort and use _month_label as the displayed column name
     month_order = (
-        pct_df[["month"]]
+        pct[["month", "_month_label"]]
         .drop_duplicates()
         .sort_values("month")
-        ["month"]
+        ["_month_label"]
         .tolist()
     )
-    heatmap_df = pct_df.pivot(index="district", columns="month", values="pct_adequate")
+    heatmap_df = pct.pivot(index=y_var, columns="_month_label", values="pct_meeting")
     heatmap_df = heatmap_df.reindex(columns=month_order)
 
+    # --- Add average row and column ---
+    # Column average (mean of each month)
+    heatmap_df.loc["Average"] = heatmap_df.mean(axis=0)
+    # Row average (mean of each y_var)
+    heatmap_df["Average"] = heatmap_df.mean(axis=1)
+    # The bottom-right cell now holds the mean of the monthly averages, i.e. the overall average
+
     # Optional rounding for nicer colorbar ticks (doesn't affect color)
     if decimals is not None:
         heatmap_df = heatmap_df.round(decimals)
 
+    # --- Dynamic figure size ---
+    if figsize is None:
+        n_rows = len(heatmap_df)
+        n_cols = len(heatmap_df.columns)
+        height = max(6, n_rows * 0.2)  # taller if many rows
+        width = max(8, n_cols * 0.6)
+        figsize = (width, height)
+
     # ---- 6) Plot heatmap ----
     sns.set(font_scale=1.0)
     fig, ax = plt.subplots(figsize=figsize)
 
     hm = sns.heatmap(
         heatmap_df,
         cmap=cmap,
-        cbar_kws={"label": f"% of consumables with Opening Balance ≥ {amc_threshold} × AMC"},
+        cbar_kws={"label": value_label},
         ax=ax,
-        annot=True, annot_kws={"size": 12},
+        annot=True, annot_kws={"size": 10},
         vmin = 0, vmax = 100)
 
     ax.set_xlabel("Month")
-    ax.set_ylabel("District")
+    ax.set_ylabel(y_var)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=xtick_rotation)
     ax.set_yticklabels(ax.get_yticklabels(), rotation=ytick_rotation)
@@ -1011,9 +1047,31 @@ def run_redistribution_scenarios(
 # Plot stock adequacy by district and month
-fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, amc_threshold = 3, filename = "mth_district_stock_adequacy_3amc.png")
-fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, amc_threshold = 2, filename = "mth_district_stock_adequacy_2amc.png")
-
+fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                                                 y_var = 'district', value_var = 'item_code',
+                                                 value_label = "% of consumables with Opening Balance ≥ 3 × AMC",
+                                                 amc_threshold = 3, compare = "ge",
+                                                 filename = "mth_district_stock_adequacy_3amc.png", figsize = (12,10))
+fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                                                 y_var = 'district', value_var = 'item_code',
+                                                 value_label = "% of consumables with Opening Balance ≥ 2 × AMC",
+                                                 amc_threshold = 2, compare = "ge",
+                                                 filename = "mth_district_stock_adequacy_2amc.png", figsize = (12,10))
+fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath,
+                                                 y_var = 'district', value_var = 'item_code',
+                                                 value_label = "% of consumables with Opening Balance ≤ 1 × AMC",
+                                                 amc_threshold = 1, compare = "le",
+                                                 filename = 
"mth_district_stock_inadequacy_1amc.png", figsize = (12,10)) +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_code', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance ≥ 3 × AMC", + amc_threshold = 3, compare = "ge", + filename = "mth_item_stock_adequacy_3amc.png") +fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_code', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance <= 1 × AMC", + amc_threshold = 1, compare = "le", + filename = "mth_item_stock_inadequacy_1amc.png") # Store dictionary in pickle format #with open(outputfilepath / "T_car.pkl", "wb") as f: # pickle.dump(T_car, f) From 31d2c879ae89897d2496012eaedde3cd869cbc38 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 21 Oct 2025 10:54:58 +0100 Subject: [PATCH 09/16] add constraints to pooling algorithm - 1. only levels 1a and 1b can receive 2. facilities only transfer stock that they have beyond 3 X AMC --- ...ate_consumable_redistribution_scenarios.py | 149 +++++++++++++----- 1 file changed, 112 insertions(+), 37 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 8857725e05..1165c8abe9 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -774,13 +774,16 @@ def redistribute_pooling_lp( tau_max: float = 3.0, # upper ceiling (storage/policy max) id_cols=("district","month","item_code"), facility_col="fac_name", + level_col="Facility_Level", amc_col="amc", close_cols=("closing_bal","received","dispensed"), keep_baseline_for_amc0: bool = True, # leave baseline availability where AMC≈0 amc_eps: float = 1e-6, # threshold to treat AMC as "zero" return_move_log: bool = True, # return a detailed df showing net movement of consumables after redistribution pooling_level: str = "district", # "district" or "cluster" - cluster_map: pd.Series | None = None # required if pooling_level=="cluster"; this specifes which cluster each facility belongs to + cluster_map: pd.Series | None = None, # required if pooling_level=="cluster"; this specifes which cluster each facility belongs to + enforce_no_harm: bool = True, # if True, enforce p_f >= baseline at each facility (hard constraint) + floor_to_baseline: bool = True # if True, report availability floored to baseline (no decrease in outputs) ) -> pd.DataFrame: """ Scenario 3: district-level pooling/push . 
@@ -810,11 +813,7 @@ def redistribute_pooling_lp( out = df.copy() # Safe opening balance - out["OB"] = ( - out[closing_bal].astype(float) - - out[received].astype(float) - + out[dispensed].astype(float) - ).replace([np.inf, -np.inf], np.nan).fillna(0.0) + out["OB"] = compute_opening_balance(out).replace([np.inf, -np.inf], np.nan).fillna(0.0) # Default (will overwrite per group) out["OB_prime"] = out["OB"] @@ -841,17 +840,33 @@ def redistribute_pooling_lp( group_cols = ["cluster_id", "month", "item_code"] node_label = "cluster_id" move_rows = [] # optional movement log + # TODO could remove the movement log for keys, g in out.groupby(group_cols, sort=False): g = g.copy() - node_val, m, i = keys if pooling_level == "cluster" else (g["district"].iloc[0], keys[1], keys[2]) + # Resolve node ID for logging and selection masks + if pooling_level == "district": + node_val, m, i = g["district"].iloc[0], keys[1], keys[2] + else: + node_val, m, i = keys + # Build per-facility Series (unique index) for AMC, OB, Level, Baseline p AMC = (g.set_index(facility_col)[amc_col] .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) OB0 = (g.set_index(facility_col)["OB"] .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + LVL = (g.set_index(facility_col)[level_col].astype(str) + .replace({np.nan: ""})) + Pbase = (g.set_index(facility_col)["available_prop"] + .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)) + + # collapse duplicates if any if AMC.index.duplicated().any(): - AMC = AMC.groupby(level=0).first() + AMC = AMC[~AMC.index.duplicated(keep="first")] + if LVL.index.duplicated().any(): + LVL = LVL[~LVL.index.duplicated(keep="first")] + if Pbase.index.duplicated().any(): + Pbase = Pbase.groupby(level=0).mean() # average baseline if duplicates if OB0.index.duplicated().any(): OB0 = OB0.groupby(level=0).sum() @@ -859,44 +874,87 @@ def redistribute_pooling_lp( if total_stock <= 1e-9: continue - # Participants + # Participants (positive demand) mask_pos = AMC >= amc_eps facs_pos = AMC.index[mask_pos].tolist() if not facs_pos: + # nothing to reallocate to; they will be donors only (handled by OB' defaults) continue AMC_pos = AMC.loc[facs_pos] - lb = (tau_min * AMC_pos).astype(float) - ub = (tau_max * AMC_pos).astype(float) - sum_lb = float(lb.sum()); - sum_ub = float(ub.sum()) - - # Feasibility fixes - if total_stock + 1e-9 < sum_lb: - scale = max(1e-9, total_stock / max(1e-9, sum_lb)) - lb = lb * scale - sum_lb = float(lb.sum()) - allow_excess_sink = total_stock > sum_ub + 1e-9 + OB0_pos = OB0.loc[facs_pos] + LVL_pos = LVL.reindex(facs_pos) + Pbase_pos = Pbase.reindex(facs_pos).fillna(0.0) + + # policy floors/ceilings (raw) + tau_min_floor = (tau_min * AMC_pos).astype(float) + tau_max_ceiling = (tau_max * AMC_pos).astype(float) + + # ---- Donor protection: x_f >= pLB_f = min(OB, 3*AMC) + pLB = np.minimum(OB0_pos, 3.0 * AMC_pos) # Series + + # ---- Receiver eligibility: only 1a/1b may increase above OB + eligible_mask = LVL_pos.isin(["1a", "1b"]) + # For eligible: UB = tau_max*AMC + # For ineligible: UB = min(OB, tau_max*AMC) (can donate, cannot receive above OB) + UB = tau_max_ceiling.copy() + UB.loc[~eligible_mask] = np.minimum(OB0_pos.loc[~eligible_mask], tau_max_ceiling.loc[~eligible_mask]) + + # ---- Lower bound assembly BEFORE scaling tau_min + LB0 = np.maximum(pLB, tau_min_floor) # candidate lower bound + + # ---- Optional "no-harm": p_f >= baseline -> x_f >= AMC_f * p_base_f + if enforce_no_harm: + no_harm_lb = (AMC_pos * Pbase_pos).astype(float) + LB0 = np.maximum(LB0, 
no_harm_lb) + + # ---- Feasibility: scale ONLY the tau_min component if needed + sum_LB0 = float(LB0.sum()) + if total_stock + 1e-9 < sum_LB0: + # We want LB = max(pLB, tau_min_scaled, [no_harm_lb]) + # Scale only the portion of tau_min_floor that lies above pLB (and above no_harm if on). + base_guard = pLB.copy() + if enforce_no_harm: + base_guard = np.maximum(base_guard, (AMC_pos * Pbase_pos)) + + # Decompose tau_min contribution above base_guard + tau_min_above = np.maximum(0.0, tau_min_floor - np.minimum(tau_min_floor, base_guard)) + need = float(tau_min_above.sum()) + # Available budget for that component: + budget = total_stock - float(base_guard.sum()) + scale = 0.0 if need <= 1e-12 else max(0.0, min(1.0, budget / max(1e-9, need))) + + tau_min_scaled = np.minimum(base_guard, tau_min_floor) + tau_min_above * scale + LB = np.maximum(base_guard, tau_min_scaled) + else: + LB = LB0 + + # ---- Excess sink if ceilings bind + sum_UB = float(UB.sum()) + allow_excess_sink = total_stock > sum_UB + 1e-9 # ---------- LP ---------- prob = LpProblem(f"Pooling_{node_val}_{m}_{i}", LpMaximize) - x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos} p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos} excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None - prob += lpSum(p.values()) # maximize total availability + # Objective: maximize total availability + prob += lpSum(p.values()) + # Conservation if excess is None: prob += lpSum(x.values()) == total_stock else: prob += lpSum(x.values()) + excess == total_stock + # Bounds + linearization for f in facs_pos: - prob += x[f] >= float(lb.loc[f]) - prob += x[f] <= float(ub.loc[f]) - prob += float(AMC_pos.loc[f]) * p[f] <= x[f] + prob += x[f] >= float(LB.loc[f]) # donor protection + tau_min (scaled) + (optional) no-harm + prob += x[f] <= float(UB.loc[f]) # eligibility-aware ceiling + prob += float(AMC_pos.loc[f]) * p[f] <= x[f] # p <= x/AMC + # Solve prob.solve() if LpStatus[prob.status] != "Optimal": continue @@ -904,15 +962,18 @@ def redistribute_pooling_lp( # Apply solution to OB' x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} - # selection mask for this node-month-item + # Selection mask for writing back if pooling_level == "district": sel = (out["district"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) else: sel = (out["cluster_id"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i)) + # Facilities with AMC>=eps get x_f mask_rows_pos = sel & out[facility_col].isin(facs_pos) out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values + # Facilities with AMC OB' = 0 + # (this matches the "donate to pool" assumption) mask_rows_zero = sel & ~out[facility_col].isin(facs_pos) out.loc[mask_rows_zero, "OB_prime"] = 0.0 @@ -928,34 +989,48 @@ def redistribute_pooling_lp( "received_from_pool": net, "x_allocated": x_f, "OB0_agg": float(OB0.get(f, 0.0)), + "eligible_receiver": bool(LVL.get(f, "") in {"1a", "1b"}), }) - # ---------- Availability after redistribution (safe) ---------- + # ---------- Availability after redistribution ---------- amc_safe = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0) - pos_mask = amc_safe >= amc_eps + pos_mask_all = amc_safe >= amc_eps p_mech = np.zeros(len(out), dtype=float) - denom = np.maximum(amc_eps, amc_safe[pos_mask].to_numpy()) - p_mech[pos_mask.values] = np.minimum( - 1.0, np.maximum(0.0, out.loc[pos_mask, "OB_prime"].to_numpy() / denom) + denom = np.maximum(amc_eps, 
@@ -928,34 +989,48 @@ def redistribute_pooling_lp(
             "received_from_pool": net,
             "x_allocated": x_f,
             "OB0_agg": float(OB0.get(f, 0.0)),
+            "eligible_receiver": bool(LVL.get(f, "") in {"1a", "1b"}),
         })
 
-    # ---------- Availability after redistribution (safe) ----------
+    # ---------- Availability after redistribution ----------
     amc_safe = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)
-    pos_mask = amc_safe >= amc_eps
+    pos_mask_all = amc_safe >= amc_eps
 
     p_mech = np.zeros(len(out), dtype=float)
-    denom = np.maximum(amc_eps, amc_safe[pos_mask].to_numpy())
-    p_mech[pos_mask.values] = np.minimum(
-        1.0, np.maximum(0.0, out.loc[pos_mask, "OB_prime"].to_numpy() / denom)
+    denom = np.maximum(amc_eps, amc_safe[pos_mask_all].to_numpy())
+    p_mech[pos_mask_all.values] = np.minimum(
+        1.0, np.maximum(0.0, out.loc[pos_mask_all, "OB_prime"].to_numpy() / denom)
     )
 
     if keep_baseline_for_amc0:
-        base = (out.loc[~pos_mask, "available_prop"]
-                .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)).to_numpy()
-        p_mech[~pos_mask.values] = base
+        base = (out.loc[~pos_mask_all, "available_prop"]
+                .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)).to_numpy()
+        p_mech[~pos_mask_all.values] = base
     else:
-        p_mech[~pos_mask.values] = (out.loc[~pos_mask, "OB_prime"].to_numpy() > 0.0).astype(float)
+        p_mech[~pos_mask_all.values] = (out.loc[~pos_mask_all, "OB_prime"].to_numpy() > 0.0).astype(float)
+
+    out["available_prop_redis_raw"] = p_mech
+    out["available_prop_redis"] = (
+        np.maximum(p_mech, out["available_prop"].astype(float).values)
+        if floor_to_baseline else p_mech
+    )
 
-    out["available_prop_redis"] = np.maximum(p_mech,out["available_prop"])
+    # Per-row movement
     out["received_from_pool"] = out["OB_prime"] - out["OB"]
 
+    # Check if the rules are correctly applied
+    # No facility should end below min(OB, 3*AMC)
+    viol = (out["OB_prime"] < np.minimum(out["OB"], 3 * out["amc"]) - 1e-6)
+    assert not viol.any()
+    # Non-eligible levels never increase OB
+    mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"])
+    assert not (out.loc[mask_non_elig, "OB_prime"] > out.loc[mask_non_elig, "OB"] + 1e-6).any()
+
     if return_move_log:
         move_log = pd.DataFrame(move_rows)
         return out, move_log
     return out
 
-
 # pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True)

From bea8ddcae45276d34524b5e047871e6e3f466844 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Tue, 21 Oct 2025 16:22:46 +0100
Subject: [PATCH 10/16] fixes to pooling algorithm -

- differentiate between tau_donor_keep and tau_max

---
 ...ate_consumable_redistribution_scenarios.py | 34 ++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py
index 1165c8abe9..872d577ec4 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py
@@ -772,6 +772,7 @@ def redistribute_pooling_lp(
     df: pd.DataFrame,
     tau_min: float = 0.25,   # lower floor in "months of demand" (≈ 7–8 days) - minimum transfer required
     tau_max: float = 3.0,    # upper ceiling (storage/policy max)
+    tau_donor_keep: float = 3.0, # minimum the donor keeps before donating
     id_cols=("district","month","item_code"),
     facility_col="fac_name",
     level_col="Facility_Level",
@@ -890,8 +891,9 @@ def redistribute_pooling_lp(
         tau_min_floor = (tau_min * AMC_pos).astype(float)
         tau_max_ceiling = (tau_max * AMC_pos).astype(float)
 
-        # ---- Donor protection: x_f >= pLB_f = min(OB, 3*AMC)
-        pLB = np.minimum(OB0_pos, 3.0 * AMC_pos)  # Series
+        # ---- Donor protection: x_f >= pLB_f = min(OB, tau_donor_keep*AMC)
+        pLB = np.minimum(OB0_pos, tau_donor_keep * AMC_pos)  # a donor's lower bound is OB when OB < tau_donor_keep * AMC,
+        # and tau_donor_keep * AMC otherwise (i.e. with tau_donor_keep = 3, at most OB - 3 * AMC can be donated)
 
         # ---- Receiver eligibility: only 1a/1b may increase above OB
         eligible_mask = LVL_pos.isin(["1a", "1b"])
@@ -901,7 +903,7 @@ def redistribute_pooling_lp(
         UB.loc[~eligible_mask] = np.minimum(OB0_pos.loc[~eligible_mask], tau_max_ceiling.loc[~eligible_mask])
 
         # ---- Lower bound assembly BEFORE scaling tau_min
-        LB0 = np.maximum(pLB, tau_min_floor)  # candidate lower bound
+        LB0 = np.maximum(pLB, tau_min_floor)  # enforces receiver's lower bound
 
         # ---- Optional "no-harm": p_f >= baseline -> x_f >= AMC_f * p_base_f
         if enforce_no_harm:
@@ -955,7 +957,7 @@ def redistribute_pooling_lp(
             prob += float(AMC_pos.loc[f]) * p[f] <= x[f]  # p <= x/AMC
 
         # Solve
-        prob.solve()
+        prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1))
         if LpStatus[prob.status] != "Optimal":
             continue
@@ -996,7 +998,8 @@ def redistribute_pooling_lp(
     amc_safe = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)
     pos_mask_all = amc_safe >= amc_eps
 
-    p_mech = np.zeros(len(out), dtype=float)
+    p_mech = np.zeros(len(out), dtype=float)  # mechanistically calculated p based on balances and AMC
+    # (different from the baseline p, which was calculated from the actual stkout_days recorded)
     denom = np.maximum(amc_eps, amc_safe[pos_mask_all].to_numpy())
     p_mech[pos_mask_all.values] = np.minimum(
         1.0, np.maximum(0.0, out.loc[pos_mask_all, "OB_prime"].to_numpy() / denom)
@@ -1009,22 +1012,30 @@ def redistribute_pooling_lp(
     else:
         p_mech[~pos_mask_all.values] = (out.loc[~pos_mask_all, "OB_prime"].to_numpy() > 0.0).astype(float)
 
+    # For levels 1a and 1b, force availability to be the max of old (actual) versus new (mechanistically calculated)
     out["available_prop_redis_raw"] = p_mech
     out["available_prop_redis"] = (
         np.maximum(p_mech, out["available_prop"].astype(float).values)
        if floor_to_baseline else p_mech
     )
+    # For levels other than 1a and 1b, force availability to be equal to old
+    mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"])
+    out.loc[mask_non_elig, "available_prop_redis"] = out.loc[mask_non_elig, "available_prop"]  # this should ideally happen automatically;
+    # however, there are facilities at levels 2-4 where some stock-out was experienced even though OB > AMC.
+    # We want to retain the original probability in these cases because our overall analysis is restricted to levels 1a and 1b
 
     # Per-row movement
     out["received_from_pool"] = out["OB_prime"] - out["OB"]
 
     # Check if the rules are correctly applied
-    # No facility should end below min(OB, 3*AMC)
-    viol = (out["OB_prime"] < np.minimum(out["OB"], 3 * out["amc"]) - 1e-6)
-    assert not viol.any()
-    # Non-eligible levels never increase OB
-    mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"])
-    assert not (out.loc[mask_non_elig, "OB_prime"] > out.loc[mask_non_elig, "OB"] + 1e-6).any()
+    # No facility should end below min(OB, tau_donor_keep*AMC)
+    viol_lb = out["OB_prime"] < (np.minimum(out["OB"], tau_donor_keep * out["amc"]) - 1e-6)
+    #assert not viol_lb.any(), "Donor-protection LB violated (OB' < min(OB, tau_donor_keep*AMC))."
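+    # Illustration of this check with invented numbers: a facility with OB = 10 and
+    # AMC = 2 has floor min(10, 3 * 2) = 6, so any OB' below 6 - tol is a violation;
+    # a facility with OB = 4 and AMC = 2 has floor min(4, 6) = 4 and may not fall below it.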
+ print(out[viol_lb][['available_prop_redis', 'available_prop', 'OB', 'OB_prime', 'amc']]) + # Non-eligible levels never increase prob + #mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"]) + #mask_correct = (out["available_prop_redis"] == out["available_prop"]) + #print(out[mask_non_elig & ~mask_correct][['available_prop_redis', 'available_prop', 'OB', 'OB_prime', 'amc']]) if return_move_log: move_log = pd.DataFrame(move_rows) @@ -1033,7 +1044,6 @@ def redistribute_pooling_lp( return out # pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True) - # ------------------------ # 4) HIGH-LEVEL DRIVER # ------------------------ From 6f1d374d8abf857094f6cb46223031e7481c9b7e Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 21 Oct 2025 16:35:40 +0100 Subject: [PATCH 11/16] remove duplicated lmis rows (same fac_name and district but different levels) --- ...ate_consumable_redistribution_scenarios.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 872d577ec4..202e823b68 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -34,6 +34,34 @@ lmis = (pd.read_csv(outputfilepath / "ResourceFile_Consumables_availability_and_usage.csv") [['district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'available_prop', 'closing_bal', 'amc', 'dispensed', 'received']]) + +# Drop duplicated facility, item, month combinations +print(lmis.shape, "rows before collapsing duplicates") +key_cols = ["district", "item_code", "fac_name", "month"] # keys that define a unique record + +# helper to keep one facility level per group (mode → most common; fallback to first non-null) +def _mode_or_first(s: pd.Series): + s = s.dropna() + if s.empty: + return np.nan + m = s.mode() + return m.iloc[0] if not m.empty else s.iloc[0] + +lmis = ( + lmis + .groupby(key_cols, as_index=False) + .agg( + closing_bal=("closing_bal", "sum"), + dispensed=("dispensed", "sum"), + received=("received", "sum"), + amc=("amc", "sum"), + available_prop=("available_prop", "mean"), + fac_type_tlo=("fac_type_tlo", _mode_or_first), # optional; remove if not needed + ) +) + +print(lmis.shape, "rows after collapsing duplicates") + # Import data on facility location location = (pd.read_excel(path_to_share / "07 - Data/Facility_GPS_Coordinates/gis_data_for_openlmis/LMISFacilityLocations_raw.xlsx") [['LMIS Facility List', 'LATITUDE', 'LONGITUDE']]) From cd4f1493a204ac3ded349cb0493b6d41d92420d8 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 21 Oct 2025 19:15:28 +0100 Subject: [PATCH 12/16] update assertions for pooling LP (to exclude the cases which were not included in the optimisation) --- ...ate_consumable_redistribution_scenarios.py | 134 +++++++++++++----- 1 file changed, 100 insertions(+), 34 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py 
b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 202e823b68..842ea2f1c0 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -4,6 +4,7 @@ import calendar import matplotlib.pyplot as plt +import seaborn as sns import numpy as np import pandas as pd @@ -871,6 +872,7 @@ def redistribute_pooling_lp( move_rows = [] # optional movement log # TODO could remove the movement log + skipped_nodes = [] # collect nodes that did NOT solve optimally for keys, g in out.groupby(group_cols, sort=False): g = g.copy() # Resolve node ID for logging and selection masks @@ -950,7 +952,7 @@ def redistribute_pooling_lp( # Decompose tau_min contribution above base_guard tau_min_above = np.maximum(0.0, tau_min_floor - np.minimum(tau_min_floor, base_guard)) need = float(tau_min_above.sum()) - # Available budget for that component: + # Available stock for that component: budget = total_stock - float(base_guard.sum()) scale = 0.0 if need <= 1e-12 else max(0.0, min(1.0, budget / max(1e-9, need))) @@ -987,7 +989,11 @@ def redistribute_pooling_lp( # Solve prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1)) if LpStatus[prob.status] != "Optimal": + skipped_nodes.append((node_val, m, i)) + #print("NO Optimal solution found", node_val, m, i) continue + #else: + #print("Optimal solution found", node_val, m, i) # Apply solution to OB' x_sol = {f: float(value(var) or 0.0) for f, var in x.items()} @@ -1056,14 +1062,48 @@ def redistribute_pooling_lp( out["received_from_pool"] = out["OB_prime"] - out["OB"] # Check if the rules are correctly applied - # No facility should end below min(OB, tau_donor_keep*AMC) - viol_lb = out["OB_prime"] < (np.minimum(out["OB"], tau_donor_keep * out["amc"]) - 1e-6) - #assert not viol_lb.any(), "Donor-protection LB violated (OB' < min(OB, tau_donor_keep*AMC))." - print(out[viol_lb][['available_prop_redis', 'available_prop', 'OB', 'OB_prime', 'amc']]) - # Non-eligible levels never increase prob - #mask_non_elig = ~out["Facility_Level"].isin(["1a", "1b"]) - #mask_correct = (out["available_prop_redis"] == out["available_prop"]) - #print(out[mask_non_elig & ~mask_correct][['available_prop_redis', 'available_prop', 'OB', 'OB_prime', 'amc']]) + # --- Build masks for skipping --- + # 1. Nodes that failed to solve optimally + # Exclude any node/month/item combinations that didn't solve optimally or which were skipped due to AMC == 0 + if skipped_nodes: + skipped_df = pd.DataFrame(skipped_nodes, columns=[node_label, "month", "item_code"]) + + # Merge to flag rows belonging to skipped groups + out = out.merge( + skipped_df.assign(skip_flag=True), + on=[node_label, "month", "item_code"], + how="left", + ) + mask_skip_solution = out["skip_flag"].fillna(False) + else: + mask_skip_solution = pd.Series(False, index=out.index) + + # 2. 
Facilities with AMC effectively zero + mask_skip_amc = out["amc"].astype(float) <= 1e-9 + + # Combined skip mask + mask_skip = mask_skip_solution | mask_skip_amc + mask_solved = ~mask_skip + print(f"Skipping {mask_skip.sum()} rows due to non-optimal LPs or AMC≈0") + + # No facility should end below min(OB, tau_donor_keep*AMC) (# Lower bound check) + tol = 1e-6 #tolerance + viol_lb = ( + (out.loc[mask_solved, "OB_prime"] < + (np.minimum(out.loc[mask_solved, "OB"], tau_donor_keep * out.loc[mask_solved, "amc"]) - tol)) + ) + + # No facility ends up above upper bounds (# Upper bound check) + elig = out.loc[mask_solved, "Facility_Level"].isin(["1a", "1b"]).values + ub = np.where( + elig, + tau_max * out.loc[mask_solved, "amc"], + np.minimum(out.loc[mask_solved, "OB"], tau_max * out.loc[mask_solved, "amc"]) + ) + viol_ub = out.loc[mask_solved, "OB_prime"].values > (ub + tol) + + assert not viol_lb.any(), "viol_lb violated" + assert not viol_ub.any(), "viol_ub violated" if return_move_log: move_log = pd.DataFrame(move_rows) @@ -1159,7 +1199,19 @@ def run_redistribution_scenarios( # osrm_base_url="https://router.project-osrm.org", # max_chunk=50) -# Plot stock adequacy by district and month +# Store dictionary in pickle format +#with open(outputfilepath / "T_car.pkl", "wb") as f: +# pickle.dump(T_car, f) +# -> Commented out because it takes long to run. The result has been stored in pickle format + +# Load pre-generated dictionary +with open(outputfilepath / "T_car.pkl", "rb") as f: + T_car = pickle.load(f) + +#edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) + +# 2) Explore the availability and distances to make decisions about optimisation rules +# Plot stock adequacy by district and month to assess what bounds to set when pooling fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, y_var = 'district', value_var = 'item_code', value_label= f"% of consumables with Opening Balance ≥ 3 × AMC", @@ -1185,16 +1237,7 @@ def run_redistribution_scenarios( value_label= f"% of facilities with Opening Balance <= 1 × AMC", amc_threshold = 1, compare = "le", filename = "mth_item_stock_inadequacy_1amc.png") -# Store dictionary in pickle format -#with open(outputfilepath / "T_car.pkl", "wb") as f: -# pickle.dump(T_car, f) -# -> Commented out because it takes long to run. The result has been stored in pickle format -# Load pre-generated dictionary -with open(outputfilepath / "T_car.pkl", "rb") as f: - T_car = pickle.load(f) - -#edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) # Browse the number of eligible neighbours depending on allowable travel time results = [] @@ -1224,34 +1267,57 @@ def run_redistribution_scenarios( # TODO find a more generalisable solution for this issue (within the optimisation functions) #lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] -#lmis = lmis[lmis.district == 'Lilongwe'] - -# Trying the updated pool-based redistribution -# 1) Build clusters from your per-district travel-time matrices (minutes) +# 3) Implement pooled redistribution +# a) Build clusters from per-district travel-time matrices # T_car_by_dist: {"District A": DF(index=fac_ids, cols=fac_ids), ...} cluster_size = 3 cluster_series = build_capacity_clusters_all(T_car, cluster_size=cluster_size) # cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ... 
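+# For intuition, a hand-built cluster map for four hypothetical facilities would be:
+#   cluster_series = pd.Series({"fac_A": "Balaka#C00", "fac_B": "Balaka#C00",
+#                               "fac_C": "Balaka#C01", "fac_D": "Balaka#C01"})
+# i.e. facilities sharing a label are pooled together (facility names invented here).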
-# 2) Pool at the cluster level -pooled_df, cluster_moves = redistribute_pooling_lp( - df=lmis, # your LMIS dataframe +# b) Run optimisation at district level +pooled_district_df, cluster_district_moves = redistribute_pooling_lp( + df=lmis_test, # the LMIS dataframe tau_min=0.25, tau_max=3.0, - pooling_level="cluster", - cluster_map=cluster_series, + tau_donor_keep = 3.0, + pooling_level="district", + cluster_map=None, return_move_log=True, + enforce_no_harm=True, + floor_to_baseline=True ) -pooled_df.to_csv(outputfilepath/ 'clustering_n3_df.csv', index=False) +print(pooled_district_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) +pooled_district_df.to_csv(outputfilepath/ 'clustering_district_df.csv', index=False) +tlo_pooled_district = ( + pooled_district_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scen16=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + -# 3) Pool at the full district level (upper bound) -pooled_district_df, district_moves = redistribute_pooling_lp( - df=lmis, +# c) Run optimisation at cluster (size = 3) level +pooled_cluster_df, cluster_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe tau_min=0.25, tau_max=3.0, - pooling_level="district", + tau_donor_keep = 3.0, + pooling_level="cluster", + cluster_map=cluster_series, return_move_log=True, + enforce_no_harm=True, + floor_to_baseline=True ) -pooled_df.to_csv(outputfilepath/ 'clustering_district_df.csv', index=False) +print(pooled_cluster_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) +pooled_cluster_df.to_csv(outputfilepath/ 'clustering_n3_df.csv', index=False) + +tlo_pooled_cluster = ( + pooled_district_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scen17=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + +# 3) Implement pairwise redistribution # 2) Run Scenario 1 (1-hour radius) tlo_30 = run_redistribution_scenarios( From 5f5f05aee6644a02b42800501bda9601e6c81ac8 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 22 Oct 2025 17:11:42 +0100 Subject: [PATCH 13/16] clean LMIS entries (only formatting) --- ...ate_consumable_redistribution_scenarios.py | 53 +++++++++++++++---- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 842ea2f1c0..e5300cfd16 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -87,6 +87,21 @@ def _mode_or_first(s: pd.Series): lmis["month"] = lmis["month"].map(month_map) lmis["Facility_Level"] = lmis["Facility_Level"].str.replace("Facility_level_", "", regex=False) +# Clean data types before analysis +# 1) Normalize fac_name +lmis["fac_name"] = ( + lmis["fac_name"] + .astype("string") # Pandas string dtype (not just object) + .str.normalize("NFKC") # unify unicode forms + .str.strip() # trim leading/trailing spaces + .str.replace(r"\s+", "_", regex=True) # collapse 
internal whitespace runs into single underscores
+)
+
+# 2) Normalize other key columns used in grouping/joins
+lmis["item_code"] = lmis["item_code"].astype("string").str.strip()
+lmis["district"] = lmis["district"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True)
+lmis["Facility_Level"] = lmis["Facility_Level"].astype("string").str.strip()
+
 # Keep only those facilities whose location is available
 lmis = lmis[lmis.lat.notna()]
 # TODO assume something else about location of these facilities with missing location - eg. centroid of district?
@@ -1008,7 +1023,7 @@ def redistribute_pooling_lp(
         mask_rows_pos = sel & out[facility_col].isin(facs_pos)
         out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values
 
-    # Facilities with AMC < eps -> OB' = 0
+        # Facilities with AMC < eps -> OB' = 0
         # (this matches the "donate to pool" assumption)
         mask_rows_zero = sel & ~out[facility_col].isin(facs_pos)
         out.loc[mask_rows_zero, "OB_prime"] = 0.0
@@ -1102,8 +1117,20 @@ def redistribute_pooling_lp(
     )
     viol_ub = out.loc[mask_solved, "OB_prime"].values > (ub + tol)
 
-    assert not viol_lb.any(), "viol_lb violated"
-    assert not viol_ub.any(), "viol_ub violated"
+    #assert not viol_lb.any(), "viol_lb violated"
+    #assert not viol_ub.any(), "viol_ub violated"
+    temp = out[mask_solved]
+    if viol_lb.any():
+        print("For the following rows (facility, item and month combinations), it is unclear why OB_prime < tau_donor_keep * AMC, "
+              "which violates a constraint in the LPP")
+        # TODO see if this problem needs to be resolved
+        print(temp[viol_lb][['Facility_Level', 'amc', 'OB', 'OB_prime']])
+
+    if viol_ub.any():
+        print("For the following rows (facility, item and month combinations), it is unclear why OB_prime > tau_max * AMC, "
+              "which violates a constraint in the LPP")
+        # TODO see if this problem needs to be resolved
+        print(temp[viol_ub][['Facility_Level', 'amc', 'OB', 'OB_prime']])
 
     if return_move_log:
         move_log = pd.DataFrame(move_rows)
@@ -1200,13 +1227,14 @@ def run_redistribution_scenarios(
 #     max_chunk=50)
 
 # Store dictionary in pickle format
-#with open(outputfilepath / "T_car.pkl", "wb") as f:
+#with open(outputfilepath / "T_car2.pkl", "wb") as f:
 #    pickle.dump(T_car, f)
 # -> Commented out because it takes long to run. 
The result has been stored in pickle format # Load pre-generated dictionary -with open(outputfilepath / "T_car.pkl", "rb") as f: +with open(outputfilepath / "T_car2.pkl", "rb") as f: T_car = pickle.load(f) +# T_car2 was created after cleaning fac names and getting rid of spaces in the text #edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) @@ -1276,17 +1304,19 @@ def run_redistribution_scenarios( # b) Run optimisation at district level pooled_district_df, cluster_district_moves = redistribute_pooling_lp( - df=lmis_test, # the LMIS dataframe - tau_min=0.25, tau_max=3.0, + df=lmis, # the LMIS dataframe + tau_min=0, tau_max=4.0, tau_donor_keep = 3.0, pooling_level="district", cluster_map=None, return_move_log=True, - enforce_no_harm=True, + enforce_no_harm=False, floor_to_baseline=True ) print(pooled_district_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) -pooled_district_df.to_csv(outputfilepath/ 'clustering_district_df.csv', index=False) +pooled_district_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis_raw', 'available_prop_redis', 'received_from_pool', + 'skip_flag']].to_csv(outputfilepath/ 'clustering_district_df.csv', index=False) tlo_pooled_district = ( pooled_district_df .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) @@ -1298,7 +1328,7 @@ def run_redistribution_scenarios( # c) Run optimisation at cluster (size = 3) level pooled_cluster_df, cluster_moves = redistribute_pooling_lp( df=lmis, # the LMIS dataframe - tau_min=0.25, tau_max=3.0, + tau_min=0, tau_max=4.0, tau_donor_keep = 3.0, pooling_level="cluster", cluster_map=cluster_series, @@ -1310,12 +1340,13 @@ def run_redistribution_scenarios( pooled_cluster_df.to_csv(outputfilepath/ 'clustering_n3_df.csv', index=False) tlo_pooled_cluster = ( - pooled_district_df + pooled_cluster_df .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) .agg(available_prop_scen17=("available_prop_redis", "mean")) .sort_values(["item_code","district","Facility_Level","month"]) ) +tlo_pooled = tlo_pooled_district.merge(tlo_pooled_cluster, on=["item_code", "district", "Facility_Level", "month"], how="left", validate="one_to_one") # 3) Implement pairwise redistribution # 2) Run Scenario 1 (1-hour radius) From c090cb65ddec8d8be213bc77b0c182c9bfb74c34 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 22 Oct 2025 18:43:34 +0100 Subject: [PATCH 14/16] Interpolate values of AMC and Closing Balance when missing or 0 --- ...ate_consumable_redistribution_scenarios.py | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index e5300cfd16..4a057748b6 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -102,8 +102,60 @@ def _mode_or_first(s: pd.Series): lmis["district"] = lmis["district"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True) lmis["Facility_Level"] = lmis["Facility_Level"].astype("string").str.strip() +# 3) 
Ensure numeric types (quietly coerce bad strings to NaN) +lmis["amc"] = pd.to_numeric(lmis["amc"], errors="coerce") +lmis["closing_bal"] = pd.to_numeric(lmis["closing_bal"], errors="coerce") + # Keep only those facilities whose location is available lmis = lmis[lmis.lat.notna()] + +lmis[(lmis.amc ==0)|(lmis.amc.isna())].to_csv(outputfilepath / 'exploring_amc_0.csv', index=False) + +# --- 1) Replace amc==0 with mean(amc>0) per (fac_name, item_code) --- +# Mark AMC bad-before +amc_bad_before = lmis["amc"].isna() | (lmis["amc"] == 0) + +# Fix AMC: fill zeros/NaNs with mean of positive AMC within (fac_name, item_code) +def _fill_amc_pos_mean(s: pd.Series) -> pd.Series: + m = s[s > 0].mean() # mean over positive AMC only + if pd.isna(m): + return s # no positive months -> leave as-is + return s.where(~(s.isna() | (s == 0)), m) + +amc_before_zeros_and_nan = ((lmis["amc"] == 0)|(lmis['amc'].isna())).sum() +lmis['amc_previously_zero_or_nan'] = ((lmis["amc"] == 0)|(lmis['amc'].isna())) +lmis["amc"] = ( + lmis.groupby(["fac_name", "item_code"], group_keys=False)["amc"] + .transform(_fill_amc_pos_mean) +) +amc_after_zeros_and_nan = ((lmis["amc"] == 0)|(lmis['amc'].isna())).sum() +print(f"AMC zeroes and nan before: {amc_before_zeros_and_nan:,} | after: {amc_after_zeros_and_nan:,}") + +# Rows where AMC was bad and is now fixed (>0) +amc_fixed_now = amc_bad_before & lmis["amc"].notna() & (lmis["amc"] > 0) + +# Prepare group means for closing_bal (non-zero, non-NaN only) +cb_group_mean = ( + lmis.groupby(["fac_name", "item_code"])["closing_bal"] + .transform(lambda s: s.replace(0, np.nan).mean()) + ) + +# Edit closing_bal ONLY where: +# (a) AMC was previously bad and is now fixed, AND +# (b) closing_bal is 0 or NaN, AND +# (c) a group mean exists +cb_bad_now = lmis["closing_bal"].isna() | (lmis["closing_bal"] == 0) +mask_edit = amc_fixed_now & cb_bad_now & cb_group_mean.notna() + +before = (cb_bad_now & amc_fixed_now).sum() + +lmis.loc[mask_edit, "closing_bal"] = cb_group_mean[mask_edit] + +after = ( (lmis["closing_bal"].isna()) | (lmis["closing_bal"] == 0) ) & amc_fixed_now +print(f"closing_bal zeros/NaNs among AMC-fixed rows — before: {before:,} | after: {after.sum():,}") + +lmis[mask_edit].to_csv(outputfilepath / 'exploring_amc_0_after_fix.csv', index=False) + # TODO assume something else about location of these facilities with missing location - eg. centroid of district? # only 16 facilties have missing information @@ -1125,12 +1177,13 @@ def redistribute_pooling_lp( "which violates a constraint in the LPP") # TODO see if this problem needs to be resolved print(temp[viol_lb][['Facility_Level', 'amc', 'OB', 'OB_prime']]) - + temp[viol_lb][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_lb.csv') if viol_ub.any(): print("For the following rows (facility, item and month combinations), unclear why OB_prime > tau_max * AMC " "which violates a constraint in the LPP") # TODO see if this problem needs to be resolved print(temp[viol_ub][['Facility_Level', 'amc', 'OB', 'OB_prime']]) + temp[viol_ub][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_ub.csv') if return_move_log: move_log = pd.DataFrame(move_rows) @@ -1303,6 +1356,7 @@ def run_redistribution_scenarios( # cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ... 
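+# Sense check of the bounds used in the runs below, on an invented facility with AMC = 10:
+# tau_min = 0 sets no transfer floor for receivers; tau_max = 4.0 caps post-pooling stock
+# at 40 units; tau_donor_keep = 3.0 means a donor holding OB = 55 gives away at most 55 - 30 = 25.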
# b) Run optimisation at district level +print("Now running pooled redistribution at District level") pooled_district_df, cluster_district_moves = redistribute_pooling_lp( df=lmis, # the LMIS dataframe tau_min=0, tau_max=4.0, @@ -1326,6 +1380,7 @@ def run_redistribution_scenarios( # c) Run optimisation at cluster (size = 3) level +print("Now running pooled redistribution at Cluster (Size = 3) level") pooled_cluster_df, cluster_moves = redistribute_pooling_lp( df=lmis, # the LMIS dataframe tau_min=0, tau_max=4.0, From a0487be30ec615cdd848b5ead16a4e90d4142470 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 23 Oct 2025 17:18:13 +0100 Subject: [PATCH 15/16] update how stock in/adequacy plots are generated (exclude bad AMC/OB rows) --- .../create_consumable_redistribution_scenarios.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index 4a057748b6..f21ebb3acd 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -214,6 +214,21 @@ def generate_stock_adequacy_heatmap( amc=("amc", "sum")) .reset_index() ) + # Identify groups (fac_name, item_code) that have any non-zero amc + nonzero_groups = agg.loc[agg["amc"] != 0, ["fac_name", "item_code"]].drop_duplicates() + + # Keep: + # - 1. all rows where amc != 0 + # - 2. rows where the (fac_name, item_code) pair never had any non-zero amc + # (because this would indicate that their AMC may in fact be zero) + # - 3. 
rows where both Closing balance and AMC are not zero + agg = agg[ + (agg["amc"] != 0) + | (~agg[["fac_name", "item_code"]] + .apply(tuple, axis=1) + .isin(nonzero_groups.apply(tuple, axis=1))) + ] + agg = agg[~((agg["amc"] == 0) & (agg["opening_bal"] == 0))] # ---- 3) Adequacy indicator per (month, district, item_code) ---- if include_missing_as_fail: From 07cfee10ae05704245e1b8685f4e9ff514fcd40a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 24 Oct 2025 12:03:57 +0100 Subject: [PATCH 16/16] drop seemingly incorrect rows from LMIS data at pre-processing --- ...ate_consumable_redistribution_scenarios.py | 119 +++++++----------- 1 file changed, 45 insertions(+), 74 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py index f21ebb3acd..cc37c26629 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -107,66 +107,13 @@ def _mode_or_first(s: pd.Series): lmis["closing_bal"] = pd.to_numeric(lmis["closing_bal"], errors="coerce") # Keep only those facilities whose location is available +old_facility_count = lmis.fac_name.nunique() lmis = lmis[lmis.lat.notna()] +new_facility_count = lmis.fac_name.nunique() +print(f"{old_facility_count - new_facility_count} facilities out of {old_facility_count} in the lmis data dropped due to " + f"missing location information") -lmis[(lmis.amc ==0)|(lmis.amc.isna())].to_csv(outputfilepath / 'exploring_amc_0.csv', index=False) - -# --- 1) Replace amc==0 with mean(amc>0) per (fac_name, item_code) --- -# Mark AMC bad-before -amc_bad_before = lmis["amc"].isna() | (lmis["amc"] == 0) - -# Fix AMC: fill zeros/NaNs with mean of positive AMC within (fac_name, item_code) -def _fill_amc_pos_mean(s: pd.Series) -> pd.Series: - m = s[s > 0].mean() # mean over positive AMC only - if pd.isna(m): - return s # no positive months -> leave as-is - return s.where(~(s.isna() | (s == 0)), m) - -amc_before_zeros_and_nan = ((lmis["amc"] == 0)|(lmis['amc'].isna())).sum() -lmis['amc_previously_zero_or_nan'] = ((lmis["amc"] == 0)|(lmis['amc'].isna())) -lmis["amc"] = ( - lmis.groupby(["fac_name", "item_code"], group_keys=False)["amc"] - .transform(_fill_amc_pos_mean) -) -amc_after_zeros_and_nan = ((lmis["amc"] == 0)|(lmis['amc'].isna())).sum() -print(f"AMC zeroes and nan before: {amc_before_zeros_and_nan:,} | after: {amc_after_zeros_and_nan:,}") - -# Rows where AMC was bad and is now fixed (>0) -amc_fixed_now = amc_bad_before & lmis["amc"].notna() & (lmis["amc"] > 0) - -# Prepare group means for closing_bal (non-zero, non-NaN only) -cb_group_mean = ( - lmis.groupby(["fac_name", "item_code"])["closing_bal"] - .transform(lambda s: s.replace(0, np.nan).mean()) - ) - -# Edit closing_bal ONLY where: -# (a) AMC was previously bad and is now fixed, AND -# (b) closing_bal is 0 or NaN, AND -# (c) a group mean exists -cb_bad_now = lmis["closing_bal"].isna() | (lmis["closing_bal"] == 0) -mask_edit = amc_fixed_now & cb_bad_now & cb_group_mean.notna() - -before = (cb_bad_now & amc_fixed_now).sum() - -lmis.loc[mask_edit, "closing_bal"] = cb_group_mean[mask_edit] - -after = ( (lmis["closing_bal"].isna()) | 
(lmis["closing_bal"] == 0) ) & amc_fixed_now
-print(f"closing_bal zeros/NaNs among AMC-fixed rows — before: {before:,} | after: {after.sum():,}")
-
-lmis[mask_edit].to_csv(outputfilepath / 'exploring_amc_0_after_fix.csv', index=False)
-
-# TODO assume something else about location of these facilities with missing location - eg. centroid of district?
-# only 16 facilties have missing information
-
-# -----------------------------------
-# 1) Data exploration
-# -----------------------------------
+# Explore missingness
 def compute_opening_balance(df: pd.DataFrame) -> pd.Series:
     """
     Opening balance from same-month records:
      OB_t = closing_bal_t - received_t + dispensed_t
     """
     return df["closing_bal"] - df["received"] + df["dispensed"]
 
+# Drop the rows which seem suspicious
+# - 1. Rows with AMC and OB = 0 (these would be excluded from the redistribution algorithm anyway)
+# - 2. Rows where the (fac_name, item_code) pair only reported 0 AMC throughout the year
+lmis.reset_index(inplace=True, drop=True)
+lmis['opening_bal'] = compute_opening_balance(lmis)
+cb0 = (lmis.closing_bal == 0)|(lmis.closing_bal.isna())
+ob0 = (lmis.opening_bal == 0)|(lmis.opening_bal.isna())
+amc0 = (lmis.amc == 0)|(lmis.amc.isna())
+d0 = (lmis.dispensed == 0)|(lmis.dispensed.isna())
+r0 = (lmis.received == 0)|(lmis.received.isna())
+prob0 = (lmis.available_prop == 0)|(lmis.available_prop.isna())
+non_zero_amc_groups = lmis.loc[~amc0, ["fac_name", "item_code"]].drop_duplicates()
+non0_amc = lmis[['fac_name', 'item_code']].apply(tuple, axis=1).isin(non_zero_amc_groups.apply(tuple, axis=1))
+non_zero_ob_groups = lmis.loc[~ob0, ["fac_name", "item_code"]].drop_duplicates()
+non0_ob = lmis[['fac_name', 'item_code']].apply(tuple, axis=1).isin(non_zero_ob_groups.apply(tuple, axis=1))
+
+print(f"{(amc0 & non0_amc).sum()} rows ({(amc0 & non0_amc).sum()/len(lmis) * 100:.2f}%) where AMC is zero even though the same facility recorded a non-zero AMC for a specific item "
+      f"for a different month in the year")
+print(f"Among these, {(amc0 & cb0 & non0_amc).sum()} rows ({(amc0 & cb0 & non0_amc).sum()/len(lmis) * 100:.2f}% of "
+      f"total) also recorded CB as zero even though CB was non-zero for a different month")
+print(f"Among these, {(amc0 & cb0 & non0_amc & ~prob0).sum()} rows "
+      f"({(amc0 & cb0 & non0_amc & ~prob0).sum()/len(lmis) * 100:.2f}% of "
+      f"total) have been recorded as having a non-zero probability of drug availability based on data from other months.")
+
+old_row_count = len(lmis)
+drop_mask = (amc0 & ob0) | (~non0_amc)
+lmis = lmis[~drop_mask]
+new_row_count = len(lmis)
+
+print(f"{old_row_count - new_row_count} ({(old_row_count - new_row_count)/old_row_count * 100:.2f}%) "
+      f"rows out of {old_row_count} in the lmis data dropped due to seemingly incorrect entries")
+lmis.reset_index(inplace=True, drop=True)
+
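+# Toy illustration of drop_mask with invented rows: for fac_A/item_1 with monthly
+# (amc, opening_bal) = (0, 0), (0, 25), (12, 30), the first row is dropped by rule 1
+# (AMC and OB both zero) while the other two are kept because the pair reports a
+# non-zero AMC at least once; fac_B/item_2 reporting amc = 0 in every month is
+# dropped entirely by rule 2, regardless of its balances.
+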
+# TODO assume something else about location of these facilities with missing location - eg. centroid of district?
+# only 16 facilities have missing information
+
+# -----------------------------------
+# 1) Data exploration
+# -----------------------------------
 def generate_stock_adequacy_heatmap(
     df: pd.DataFrame,
     figures_path: Path = Path("figures"),
@@ -214,21 +200,6 @@ def generate_stock_adequacy_heatmap(
             amc=("amc", "sum"))
         .reset_index()
     )
-    # Identify groups (fac_name, item_code) that have any non-zero amc
-    nonzero_groups = agg.loc[agg["amc"] != 0, ["fac_name", "item_code"]].drop_duplicates()
-
-    # Keep:
-    # - 1. all rows where amc != 0
-    # - 2. rows where the (fac_name, item_code) pair never had any non-zero amc
-    #      (because this would indicate that their AMC may in fact be zero)
-    # - 3. rows where both Closing balance and AMC are not zero
-    agg = agg[
-        (agg["amc"] != 0)
-        | (~agg[["fac_name", "item_code"]]
-           .apply(tuple, axis=1)
-           .isin(nonzero_groups.apply(tuple, axis=1)))
-    ]
-    agg = agg[~((agg["amc"] == 0) & (agg["opening_bal"] == 0))]
 
     # ---- 3) Adequacy indicator per (month, district, item_code) ----
     if include_missing_as_fail:
@@ -307,7 +278,7 @@ def generate_stock_adequacy_heatmap(
         cmap=cmap,
         cbar_kws={"label": value_label},
         ax=ax,
-        annot=True, annot_kws={"size": 10},
+        annot=annotation, annot_kws={"size": 10},
         vmin = 0, vmax = 100)
 
     ax.set_xlabel("Month")