From 8039d8371a6b19bfd37bc1a51a6377ed6870aa5d Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Fri, 15 May 2026 13:48:32 +0100
Subject: [PATCH 1/6] initiate the files for patient mix analysis

---
 .../analysis_scripts/analysis_patient_mix.py  | 166 ++++++++++++++++++
 .../scenarios/scenario_patient_mix.py         |  67 +++++++
 .../formatting_hrh_per_clinic_tlm_2024.py     |  27 +++
 3 files changed, 260 insertions(+)
 create mode 100644 src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
 create mode 100644 src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py
 create mode 100644 src/scripts/data_file_processing/healthsystem/human_resources/formatting_hrh_per_clinic_tlm_2024.py

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
new file mode 100644
index 0000000000..879c1403c0
--- /dev/null
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -0,0 +1,166 @@
+"""
+This file analyses and plots the patient load and mix in service areas including outpatient (including ANC) and
+emergency care.
+
+The scenarios are defined in XXX.py.
+"""
+import argparse
+from collections import Counter
+from pathlib import Path
+from typing import Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+from matplotlib import pyplot as plt
+
+from tlo import Date
+from tlo.analysis.utils import (
+    extract_results,
+    load_pickled_dataframes,
+    summarize,
+    unflatten_flattened_multi_index_in_logging,
+)
+
+
+def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None,
+          the_target_period: Tuple[Date, Date] = None,
+          district_resolution: Optional[bool] = False,
+          level_resolution: Optional[bool] = False,
+          year_resolution: Optional[bool] = False,
+          month_resolution: Optional[bool] = False,
+          date_resolution: Optional[bool] = False):
+
+    TARGET_PERIOD = the_target_period
+
+    def get_patient_counts_in_emerg(_df):
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+
+        # include HSIs of generic first attendance (nonemerg) and first and subsequent ANC visits
+        _df = _df.loc[_df['TREATMENT_ID'].isin(['FirstAttendance_Emergency',
+                                                'FirstAttendance_SpuriousEmergencyCare'])]
+
+        # count patient volume, dropping duplicated person_ids between first attendance and anc on the same day
+        _daily_person_counts = _df.groupby("date")["Person_ID"].nunique()
+        _daily_person_counts.index = pd.to_datetime(_daily_person_counts.index)
+        _daily_person_counts.name = "daily_patient_volume"
+        return _daily_person_counts
+
+    def get_staff_counts(_df):
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+        _df_staff = (
+            pd.Series(_df.GenericClinic[0], name="staff_count")
+            .rename_axis("facility_officer")
+            .reset_index()
+        )
+
+        _df_staff[["facility_id", "officer_type"]] = _df_staff["facility_officer"].str.extract(
+            r"FacilityID_(\d+)_Officer_(.*)"
+        )
+
+        _df_staff["facility_id"] = _df_staff["facility_id"].astype(int)
+
+        _df_staff = _df_staff[["facility_id", "officer_type", "staff_count"]]
+
+        _df_staff = _df_staff.loc[_df_staff.officer_type != 'DCSA']
+
+        _df_staff = pd.Series(_df_staff.staff_count.sum())
+
+        _df_staff.index = [pd.to_datetime(_df["date"].iloc[0])]
+        _df_staff.name = 'yearly_staff_count'
+
+        return _df_staff
+
+    def get_patient_counts_in_opd(_df):
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+
+        # include HSIs of generic first attendance (nonemerg) and first and subsequent ANC visits
+        _df = _df.loc[_df['TREATMENT_ID'].isin(['FirstAttendance_NonEmergency',
+                                                'AntenatalCare_Outpatient'])]
+
+        # count patient volume, dropping duplicated person_ids between first attendance and anc on the same day
+        _daily_person_counts = _df.groupby("date")["Person_ID"].nunique()
+        _daily_person_counts.index = pd.to_datetime(_daily_person_counts.index)
+        _daily_person_counts.name = "daily_patient_volume"
+
+        return _daily_person_counts
+
+    # log = load_pickled_dataframes(results_folder, 0, 0)
+    # h = pd.DataFrame(
+    #     log['tlo.methods.healthsystem.summary']['hsi_event_details'].iloc[0]['hsi_event_key_to_event_details']
+    # ).T
+
+    hcw_count = extract_results(
+        results_folder,
+        module="tlo.methods.healthsystem.summary",
+        key="number_of_hcw_staff",
+        custom_generate_series=get_staff_counts,
+        do_scaling=False
+    )
+    hcw_count = hcw_count[(0, 0)]
+
+    patient_volume_opd = extract_results(
+        results_folder,
+        module="tlo.methods.healthsystem",
+        key="HSI_Event",
+        custom_generate_series=get_patient_counts_in_opd,
+        do_scaling=True
+    )
+
+    patient_volume_emerg = extract_results(
+        results_folder,
+        module="tlo.methods.healthsystem",
+        key="HSI_Event",
+        custom_generate_series=get_patient_counts_in_emerg,
+        do_scaling=True
+    )
+
+    patient_load_per_hcw_in_opd = patient_volume_opd / hcw_count.values[0]
+    patient_load_per_hcw_in_emerg = patient_volume_emerg / hcw_count.values[0]
+
+    opd = patient_load_per_hcw_in_opd[(0, 0)].rename("Outpatient care")
+    emerg = patient_load_per_hcw_in_emerg[(0, 0)].rename("Emergency/Intensive care")
+
+    patient_load_per_hcw = pd.concat([opd, emerg], axis=1)
+    patient_load_per_hcw = patient_load_per_hcw.fillna(0)
+
+    patient_load_per_hcw = (
+        patient_load_per_hcw
+        .reset_index()
+        .melt(
+            id_vars="date",
+            var_name="coarse_loc_cat_str",
+            value_name="daily_patient_load_per_hcw"
+        )
+    )
+
+    path_to_tlm_folder = (
+        resourcefilepath
+        / "healthsystem"
+        / "human_resources"
+        / "TLM_2024"
+    )
+
+    patient_load_per_hcw.to_stata(
+        path_to_tlm_folder / "tlo_pat_load.dta",
+        write_index=False,
+        convert_dates={"date": "td"}
+    )
+
+    # issue: do not know how many staff members are in outpatient, emergency care, and inpatient
+    # issue: how to treat nan entries for emerg -> default 0?
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("results_folder", type=Path)
+    args = parser.parse_args()
+
+    apply(
+        results_folder=args.results_folder,
+        output_folder=args.results_folder,
+        resourcefilepath=Path('./resources'),
+        the_target_period=(Date(2010, 1, 1), Date(2010, 5, 31))
+    )
+
+
diff --git a/src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py b/src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py
new file mode 100644
index 0000000000..ec14c0e980
--- /dev/null
+++ b/src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py
@@ -0,0 +1,67 @@
+"""
+This file defines a batch run of a large population for a long time with all disease modules and full use of HSIs
+It's used for calibrations (demographic patterns, health burdens and healthsystem usage)
+
+Run on the batch system using:
+```tlo batch-submit src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py```
+
+or locally using:
+    ```tlo scenario-run src/scripts/calibration_analyses/scenarios/scenario_patient_mix.py```
+
+"""
+from typing import Dict
+
+from tlo import Date, logging
+from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios
+from tlo.methods.fullmodel import fullmodel
+from tlo.scenario import BaseScenario
+
+
+class LongRun(BaseScenario):
+    def __init__(self):
+        super().__init__()
+        self.seed = 0
+        self.start_date = Date(2010, 1, 1)
+        self.end_date = Date(2025, 1, 1)  # The simulation will stop before reaching this date.
+        self.pop_size = 100_000
+        self._scenarios = self._get_scenarios()
+        self.number_of_draws = len(self._scenarios)
+        self.runs_per_draw = 1
+
+    def log_configuration(self):
+        return {
+            'filename': 'scenario_patient_mix',  # <- (specified only for local running)
+            'directory': './outputs',  # <- (specified only for local running)
+            'custom_levels': {
+                '*': logging.WARNING,
+                'tlo.methods.healthsystem': logging.DEBUG,
+                'tlo.methods.healthsystem.summary': logging.INFO,
+            }
+        }
+
+    def modules(self):
+        return fullmodel()
+
+    def draw_parameters(self, draw_number, rng):
+        if draw_number < self.number_of_draws:
+            return list(self._scenarios.values())[draw_number]
+
+    def _get_scenarios(self) -> Dict[str, Dict]:
+        """Return the Dict with values for the parameters that are changed, keyed by a name for the scenario."""
+        return {
+            "Mode 1 with historical HRH growth":
+                mix_scenarios(
+                    get_parameters_for_status_quo(),
+                    {
+                        'HealthSystem': {
+                            'yearly_HR_scaling_mode': 'historical_scaling',  # for 5 years between 2020-2024
+                        },
+                    }
+                )
+        }
+
+
+if __name__ == '__main__':
+    from tlo.cli import scenario_run
+
+    scenario_run([__file__])
diff --git a/src/scripts/data_file_processing/healthsystem/human_resources/formatting_hrh_per_clinic_tlm_2024.py b/src/scripts/data_file_processing/healthsystem/human_resources/formatting_hrh_per_clinic_tlm_2024.py
new file mode 100644
index 0000000000..33cb78f0fc
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/human_resources/formatting_hrh_per_clinic_tlm_2024.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+resourcefilepath = Path("./resources")
+
+path_to_tlm_tool_six = (
+    resourcefilepath
+    / "healthsystem"
+    / "human_resources"
+    / "TLM_2024"
+    / "TLM_Tool_6_Facility_Level_TMS_v1_cleaned_v4.xlsx"
+)
+
+hrh_per_clinic = pd.read_excel(path_to_tlm_tool_six, sheet_name="Facility Level TMS")
+
+id_vars = [
+    "Facility ID",
+    "Clinic/ward/department",
+    "Opening date and time",
+    "Closing date and time",
+]
+
+duplicated_rows = hrh_per_clinic[
+    hrh_per_clinic.duplicated(subset=id_vars, keep=False)
+]

From 85aafecb297dabe9d8b9bc7b288d5676879b4ab5 Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Wed, 20 May 2026 11:22:33 +0100
Subject: [PATCH 2/6] update analysis file to calculate daily patient load
 across all service areas

---
 .../analysis_scripts/analysis_patient_mix.py  | 64 +++++++++++++------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
index 879c1403c0..8f877700a2 100644
--- a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -47,9 +47,10 @@ def get_patient_counts_in_emerg(_df):
         return _daily_person_counts
 
     def get_staff_counts(_df):
-        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+        the_hrh_target_period = (Date(2024, 1, 1), Date(2024, 1, 1))
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*the_hrh_target_period), :]
         _df_staff = (
-            pd.Series(_df.GenericClinic[0], name="staff_count")
+            pd.Series(_df.GenericClinic.iloc[0], name="staff_count")
             .rename_axis("facility_officer")
             .reset_index()
         )
@@ -85,9 +86,19 @@ def get_patient_counts_in_opd(_df):
 
         return _daily_person_counts
 
+    def get_patient_counts(_df):
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+
+        # count overall patient volume, dropping duplicated person_ids between first attendance and anc on the same day
+        _daily_person_counts = _df.groupby("date")["Person_ID"].nunique()
+        _daily_person_counts.index = pd.to_datetime(_daily_person_counts.index)
+        _daily_person_counts.name = "daily_patient_volume"
+
+        return _daily_person_counts
+
     # log = load_pickled_dataframes(results_folder, 0, 0)
     # h = pd.DataFrame(
-    #     log['tlo.methods.healthsystem.summary']['hsi_event_details'].iloc[0]['hsi_event_key_to_event_details']
+    #     log['tlo.methods.healthsystem.']['hsi_event_details'].iloc[0]['hsi_event_key_to_event_details']
     # ).T
 
     hcw_count = extract_results(
@@ -99,29 +110,39 @@ def get_patient_counts_in_opd(_df):
     )
     hcw_count = hcw_count[(0, 0)]
 
-    patient_volume_opd = extract_results(
+    patient_volume = extract_results(
         results_folder,
         module="tlo.methods.healthsystem",
         key="HSI_Event",
-        custom_generate_series=get_patient_counts_in_opd,
+        custom_generate_series=get_patient_counts,
         do_scaling=True
     )
 
-    patient_volume_emerg = extract_results(
-        results_folder,
-        module="tlo.methods.healthsystem",
-        key="HSI_Event",
-        custom_generate_series=get_patient_counts_in_emerg,
-        do_scaling=True
-    )
-
-    patient_load_per_hcw_in_opd = patient_volume_opd / hcw_count.values[0]
-    patient_load_per_hcw_in_emerg = patient_volume_emerg / hcw_count.values[0]
-
-    opd = patient_load_per_hcw_in_opd[(0, 0)].rename("Outpatient care")
-    emerg = patient_load_per_hcw_in_emerg[(0, 0)].rename("Emergency/Intensive care")
-
-    patient_load_per_hcw = pd.concat([opd, emerg], axis=1)
+    # patient_volume_opd = extract_results(
+    #     results_folder,
+    #     module="tlo.methods.healthsystem",
+    #     key="HSI_Event",
+    #     custom_generate_series=get_patient_counts_in_opd,
+    #     do_scaling=True
+    # )
+    #
+    # patient_volume_emerg = extract_results(
+    #     results_folder,
+    #     module="tlo.methods.healthsystem",
+    #     key="HSI_Event",
+    #     custom_generate_series=get_patient_counts_in_emerg,
+    #     do_scaling=True
+    # )
+
+    # patient_load_per_hcw_in_opd = patient_volume_opd / hcw_count.values[0]
+    # patient_load_per_hcw_in_emerg = patient_volume_emerg / hcw_count.values[0]
+    #
+    # opd = patient_load_per_hcw_in_opd[(0, 0)].rename("Outpatient care")
+    # emerg = patient_load_per_hcw_in_emerg[(0, 0)].rename("Emergency/Intensive care")
+    #
+    # patient_load_per_hcw = pd.concat([opd, emerg], axis=1)
+
+    patient_load_per_hcw = patient_volume / hcw_count.values[0]
     patient_load_per_hcw = patient_load_per_hcw.fillna(0)
 
     patient_load_per_hcw = (
@@ -133,6 +154,7 @@ def get_patient_counts_in_opd(_df):
             value_name="daily_patient_load_per_hcw"
         )
     )
+    patient_load_per_hcw["coarse_loc_cat_str"] = "All service area"
 
     path_to_tlm_folder = (
         resourcefilepath
@@ -160,7 +182,7 @@ def get_patient_counts_in_opd(_df):
         results_folder=args.results_folder,
         output_folder=args.results_folder,
         resourcefilepath=Path('./resources'),
-        the_target_period=(Date(2010, 1, 1), Date(2010, 5, 31))
+        the_target_period=(Date(2024, 1, 1), Date(2024, 5, 31))
     )
 
 

From 910136045af4c4da951cb2f13b8cce2674fa7e8d Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Thu, 28 May 2026 10:25:41 +0100
Subject: [PATCH 3/6] prepare daily patient load by district and
 level/facility_id

---
 .../analysis_scripts/analysis_patient_mix.py  | 108 +++++++++++++++++-
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
index 8f877700a2..49129de5b7 100644
--- a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -46,7 +46,7 @@ def get_patient_counts_in_emerg(_df):
         _daily_person_counts.name = "daily_patient_volume"
         return _daily_person_counts
 
-    def get_staff_counts(_df):
+    def get_hcw_count(_df):
         the_hrh_target_period = (Date(2024, 1, 1), Date(2024, 1, 1))
         _df = _df.loc[pd.to_datetime(_df['date']).between(*the_hrh_target_period), :]
         _df_staff = (
@@ -86,26 +86,124 @@ def get_patient_counts_in_opd(_df):
 
         return _daily_person_counts
 
-    def get_patient_counts(_df):
+    def get_patient_count(_df):
         _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
 
-        # count overall patient volume, dropping duplicated person_ids between first attendance and anc on the same day
+        # count overall patient volume, dropping duplicated person_ids on the same day
         _daily_person_counts = _df.groupby("date")["Person_ID"].nunique()
         _daily_person_counts.index = pd.to_datetime(_daily_person_counts.index)
         _daily_person_counts.name = "daily_patient_volume"
 
         return _daily_person_counts
 
+    def get_hcw_count_facility_id(_df):
+        the_hrh_target_period = (Date(2024, 1, 1), Date(2024, 1, 1))
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*the_hrh_target_period), :]
+
+        _df_staff = (
+            pd.Series(_df.GenericClinic.iloc[0], name="Staff_Count")
+            .rename_axis("facility_officer")
+            .reset_index()
+        )
+
+        _df_staff[["Facility_ID", "Officer_Type"]] = _df_staff["facility_officer"].str.extract(
+            r"FacilityID_(\d+)_Officer_(.*)"
+        )
+
+        _df_staff["Facility_ID"] = _df_staff["Facility_ID"].astype(int)
+
+        _df_staff = _df_staff[["Facility_ID", "Officer_Type", "Staff_Count"]]
+
+        _df_staff = _df_staff.groupby("Facility_ID")["Staff_Count"].sum()
+
+        return _df_staff
+
+    def get_patient_count_facility_id(_df):
+        _df = _df.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), :]
+
+        # count overall patient volume, dropping duplicated person_ids on the same day
+        _daily_person_counts = _df.groupby(["date", "Facility_ID"])["Person_ID"].nunique().reset_index()
+        _daily_person_counts["date"] = pd.to_datetime(_daily_person_counts.date)
+        _daily_person_counts = _daily_person_counts.set_index(["date", "Facility_ID"])["Person_ID"]
+        _daily_person_counts.name = "daily_patient_volume"
+
+        return _daily_person_counts
+
+    def merge_info_from_mfl(_df):
+        # should merge on "left" as original designed level 1b is now merged in to level 2
+        _df = _df.merge(mfl[["Facility_ID", "District", "Facility_Level", "Region"]], on="Facility_ID", how="left")
+
+        return _df
+
+
     # log = load_pickled_dataframes(results_folder, 0, 0)
     # h = pd.DataFrame(
     #     log['tlo.methods.healthsystem.']['hsi_event_details'].iloc[0]['hsi_event_key_to_event_details']
     # ).T
 
+    mfl = pd.read_csv(resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv')
+
+    ## patient volume
+    patient_volume_facility_id = extract_results(
+        results_folder,
+        module="tlo.methods.healthsystem",
+        key="HSI_Event",
+        custom_generate_series=get_patient_count_facility_id,
+        do_scaling=True
+    )
+
+    patient_volume_facility_id.columns = patient_volume_facility_id.columns.droplevel('run')
+    patient_volume_facility_id = patient_volume_facility_id.reset_index().rename(
+        columns={0: "Patient_Volume", "date": "Date"})
+    patient_volume_facility_id = merge_info_from_mfl(patient_volume_facility_id)
+
+    ## hcw count
+    hcw_count_facility_id = extract_results(
+        results_folder,
+        module="tlo.methods.healthsystem.summary",
+        key="number_of_hcw_staff",
+        custom_generate_series=get_hcw_count_facility_id,
+        do_scaling=False
+    )
+    hcw_count_facility_id.columns = hcw_count_facility_id.columns.droplevel('run')
+    hcw_count_facility_id = hcw_count_facility_id.reset_index().rename(columns={0: 'Staff_Count'})
+    hcw_count_facility_id = merge_info_from_mfl(hcw_count_facility_id)
+
+    # fix levels in hcw_count: 1b has no staff now as merged to 2; ZMH at 4 to be merged to 3; drop HQ at 5
+    hcw_count_facility_id.drop(index=hcw_count_facility_id[hcw_count_facility_id["Facility_Level"] == "5"].index,
+                               inplace=True)
+    assert (hcw_count_facility_id.loc[hcw_count_facility_id["Facility_Level"] == "1b", "Staff_Count"] == 0).all()
+    hcw_count_facility_id.drop(index=hcw_count_facility_id[hcw_count_facility_id["Facility_Level"] == "1b"].index,
+                               inplace=True)
+    ## patient load
+    assert set(patient_volume_facility_id.Facility_ID.drop_duplicates()).issubset(
+        set(hcw_count_facility_id.Facility_ID.drop_duplicates())
+    )
+    daily_patient_load_per_hcw = patient_volume_facility_id[["Facility_ID", "Patient_Volume"]].merge(
+        hcw_count_facility_id[["Facility_ID", "District", "Facility_Level", "Region", "Staff_Count"]],
+        on=["Facility_ID"], how="right")
+    # fill NAN entries
+    daily_patient_load_per_hcw.loc[
+        daily_patient_load_per_hcw["Facility_Level"] == "4", ["Patient_Volume", "District", "Region"]
+    ] = [0, "Central Hospitals (Southern)", "Southern"]  # ZMH
+    daily_patient_load_per_hcw.loc[
+        daily_patient_load_per_hcw["Facility_ID"] == 128, "District"
+    ] = "Central Hospitals (Southern)"
+    daily_patient_load_per_hcw.loc[
+        daily_patient_load_per_hcw["Facility_ID"] == 129, "District"
+    ] = "Central Hospitals (Northern)"
+    daily_patient_load_per_hcw.loc[
+        daily_patient_load_per_hcw["Facility_ID"] == 130, "District"
+    ] = "Central Hospitals (Central)"
+
+    daily_patient_load_per_hcw['Daily_Patient_Load_Per_HCW'] = (daily_patient_load_per_hcw['Patient_Volume']
+                                                                / daily_patient_load_per_hcw['Staff_Count'])
+
     hcw_count = extract_results(
         results_folder,
         module="tlo.methods.healthsystem.summary",
         key="number_of_hcw_staff",
-        custom_generate_series=get_staff_counts,
+        custom_generate_series=get_hcw_count,
         do_scaling=False
     )
     hcw_count = hcw_count[(0, 0)]
@@ -114,7 +212,7 @@ def get_patient_counts(_df):
         results_folder,
         module="tlo.methods.healthsystem",
         key="HSI_Event",
-        custom_generate_series=get_patient_counts,
+        custom_generate_series=get_patient_count,
         do_scaling=True
     )
 

From 2779d468048125e0d21b4621c85843cbbad5f9fe Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Thu, 28 May 2026 16:51:41 +0100
Subject: [PATCH 4/6] merge pat load estimates from three sources

---
 .../analysis_scripts/analysis_patient_mix.py  | 129 ++++++++++++------
 1 file changed, 89 insertions(+), 40 deletions(-)

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
index 49129de5b7..625d1569fa 100644
--- a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -199,23 +199,73 @@ def merge_info_from_mfl(_df):
     daily_patient_load_per_hcw['Daily_Patient_Load_Per_HCW'] = (daily_patient_load_per_hcw['Patient_Volume']
                                                                 / daily_patient_load_per_hcw['Staff_Count'])
 
-    hcw_count = extract_results(
-        results_folder,
-        module="tlo.methods.healthsystem.summary",
-        key="number_of_hcw_staff",
-        custom_generate_series=get_hcw_count,
-        do_scaling=False
+    # read in TLM estimates
+    path_to_tlm_folder = (
+        resourcefilepath
+        / "healthsystem"
+        / "human_resources"
+        / "TLM_2024"
     )
-    hcw_count = hcw_count[(0, 0)]
 
-    patient_volume = extract_results(
-        results_folder,
-        module="tlo.methods.healthsystem",
-        key="HSI_Event",
-        custom_generate_series=get_patient_count,
-        do_scaling=True
+    hcw_tms_pat_load = pd.read_stata(path_to_tlm_folder/"tool_3_pat_load.dta", convert_categoricals=True)
+    fac_tms_pat_load = pd.read_stata(path_to_tlm_folder / "tool_6_pat_load.dta", convert_categoricals=True)
+
+    # check that districts and facility levels in the two tools are a subset of TLO output;
+    # district and facility level consistency in the two tools already checked in Stata
+    hcw_tms_pat_load["district"] = hcw_tms_pat_load["district"].replace({"Mzuzu": "Mzuzu City"})
+    fac_tms_pat_load["district"] = fac_tms_pat_load["district"].replace({"Mzuzu": "Mzuzu City"})
+
+    assert set(hcw_tms_pat_load['district'].unique()).issubset(
+        set(daily_patient_load_per_hcw['District'].unique())
+    )
+    assert set(hcw_tms_pat_load['fac_level'].unique()).issubset(
+        set(daily_patient_load_per_hcw['Facility_Level'].unique())
     )
 
+    # merge all three estimates in one dataframe, noting the source and keeping all observations
+    hcw_tms_pat_load = hcw_tms_pat_load.rename(columns={
+        "fac_level": "Facility_Level",
+        "district": "District",
+        "pat_day_tms_nr_adj": "Daily_Patient_Load_Per_HCW",
+    })
+    fac_tms_pat_load = fac_tms_pat_load.rename(columns={
+        "fac_level": "Facility_Level",
+        "district": "District",
+        "pat_load_per_hcw": "Daily_Patient_Load_Per_HCW",
+    })
+
+    hcw_tms_pat_load["Source"] = "HCW TMS"
+    fac_tms_pat_load["Source"] = "Facility Summary"
+    daily_patient_load_per_hcw["Source"] = "TLO"
+
+    pat_load_comparison = pd.concat([
+        hcw_tms_pat_load[["District", "Facility_Level", "Daily_Patient_Load_Per_HCW", "Source",]],
+        fac_tms_pat_load[["District", "Facility_Level", "Daily_Patient_Load_Per_HCW", "Source"]],
+        daily_patient_load_per_hcw[["District", "Facility_Level", "Daily_Patient_Load_Per_HCW", "Source"]],
+    ], ignore_index=True)
+
+    assert len(hcw_tms_pat_load) + len(fac_tms_pat_load) + len(daily_patient_load_per_hcw) == len(pat_load_comparison)
+
+    # make plots for patient load comparison at each level (1b, 2 and 3);
+    # y-axis is the district and x-axis is the patient load
+
+    # hcw_count = extract_results(
+    #     results_folder,
+    #     module="tlo.methods.healthsystem.summary",
+    #     key="number_of_hcw_staff",
+    #     custom_generate_series=get_hcw_count,
+    #     do_scaling=False
+    # )
+    # hcw_count = hcw_count[(0, 0)]
+    #
+    # patient_volume = extract_results(
+    #     results_folder,
+    #     module="tlo.methods.healthsystem",
+    #     key="HSI_Event",
+    #     custom_generate_series=get_patient_count,
+    #     do_scaling=True
+    # )
+
     # patient_volume_opd = extract_results(
     #     results_folder,
     #     module="tlo.methods.healthsystem",
@@ -240,35 +290,34 @@ def merge_info_from_mfl(_df):
     #
     # patient_load_per_hcw = pd.concat([opd, emerg], axis=1)
 
-    patient_load_per_hcw = patient_volume / hcw_count.values[0]
-    patient_load_per_hcw = patient_load_per_hcw.fillna(0)
-
-    patient_load_per_hcw = (
-        patient_load_per_hcw
-        .reset_index()
-        .melt(
-            id_vars="date",
-            var_name="coarse_loc_cat_str",
-            value_name="daily_patient_load_per_hcw"
-        )
-    )
-    patient_load_per_hcw["coarse_loc_cat_str"] = "All service area"
-
-    path_to_tlm_folder = (
-        resourcefilepath
-        / "healthsystem"
-        / "human_resources"
-        / "TLM_2024"
-    )
-
-    patient_load_per_hcw.to_stata(
-        path_to_tlm_folder / "tlo_pat_load.dta",
-        write_index=False,
-        convert_dates={"date": "td"}
-    )
+    # patient_load_per_hcw = patient_volume / hcw_count.values[0]
+    # patient_load_per_hcw = patient_load_per_hcw.fillna(0)
+    #
+    # patient_load_per_hcw = (
+    #     patient_load_per_hcw
+    #     .reset_index()
+    #     .melt(
+    #         id_vars="date",
+    #         var_name="coarse_loc_cat_str",
+    #         value_name="daily_patient_load_per_hcw"
+    #     )
+    # )
+    # patient_load_per_hcw["coarse_loc_cat_str"] = "All service area"
+    #
+    # path_to_tlm_folder = (
+    #     resourcefilepath
+    #     / "healthsystem"
+    #     / "human_resources"
+    #     / "TLM_2024"
+    # )
+    #
+    # patient_load_per_hcw.to_stata(
+    #     path_to_tlm_folder / "tlo_pat_load.dta",
+    #     write_index=False,
+    #     convert_dates={"date": "td"}
+    # )
 
     # issue: do not know how many staff members are in outpatient, emergency care, and inpatient
-    # issue: how to treat nan entries for emerg -> default 0?
 
 
 if __name__ == "__main__":

From cdfef316e1773444a925c2a2a2bafc95846037b1 Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Fri, 29 May 2026 15:51:52 +0100
Subject: [PATCH 5/6] select common districts across sources and make pat load
 comparison plots, both mean and median

---
 .../analysis_scripts/analysis_patient_mix.py  | 682 +++++++++++++++++-
 1 file changed, 680 insertions(+), 2 deletions(-)

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
index 625d1569fa..5feccd1e70 100644
--- a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -246,8 +246,686 @@ def merge_info_from_mfl(_df):
 
     assert len(hcw_tms_pat_load) + len(fac_tms_pat_load) + len(daily_patient_load_per_hcw) == len(pat_load_comparison)
 
-    # make plots for patient load comparison at each level (1b, 2 and 3);
-    # y-axis is the district and x-axis is the patient load
+    # *** make comparison plots ***
+    from matplotlib.ticker import MultipleLocator
+
+    # shared settings and helper function
+
+    facility_levels = ["1a", "2", "3", "4"]
+    sources = ["TLO", "HCW TMS", "Facility Summary"]
+
+    markers = {
+        "HCW TMS": "o",
+        "Facility Summary": "^",
+        "TLO": "d"
+    }
+
+    source_colors = {
+        "TLO": "green",
+        "HCW TMS": "blue",
+        "Facility Summary": "orange"
+    }
+
+    PLOT_STYLE = {
+        "font.size": 16,
+        "axes.labelsize": 16,
+        "xtick.labelsize": 15,
+        "ytick.labelsize": 15,
+        "legend.fontsize": 16
+    }
+
+    plt.rcParams.update(PLOT_STYLE)
+
+    patient_load_col = "Daily_Patient_Load_Per_HCW"
+
+    def annotate_small_n_horizontal(
+        ax,
+        y_positions,
+        upper_bounds,
+        counts,
+        threshold=10,
+        symbol="*"
+    ):
+        mask = counts < threshold
+
+        for yy, xx in zip(
+            np.asarray(y_positions)[mask],
+            np.asarray(upper_bounds)[mask]
+        ):
+            ax.text(
+                xx,
+                yy,
+                symbol,
+                ha="left",
+                va="center",
+                fontsize=15,
+                color="red"
+            )
+
+    def annotate_small_n_vertical(
+        ax,
+        x_positions,
+        upper_bounds,
+        counts,
+        threshold=10,
+        symbol="*"
+    ):
+        mask = counts < threshold
+
+        for xx, yy in zip(
+            np.asarray(x_positions)[mask],
+            np.asarray(upper_bounds)[mask]
+        ):
+            ax.text(
+                xx,
+                yy,
+                symbol,
+                ha="center",
+                va="bottom",
+                fontsize=14,
+                color="red"
+            )
+
+    # prepare data: keep districts appearing in HCW TMS
+
+    common_districts = (
+        pat_load_comparison
+        .loc[pat_load_comparison["Source"] == "HCW TMS", "District"]
+        .unique()
+    )
+
+    df_plot = pat_load_comparison[
+        pat_load_comparison["District"].isin(common_districts)
+    ].copy()
+
+    # ** median + IQR plots **
+
+    def summarise_median_iqr(data, group_cols):
+        return (
+            data
+            .groupby(group_cols)[patient_load_col]
+            .agg(
+                median="median",
+                q25=lambda x: x.quantile(0.25),
+                q75=lambda x: x.quantile(0.75),
+                n="count"
+            )
+            .reset_index()
+        )
+
+    # Plot 1: by facility level
+    # y-axis = district, x-axis = patient load
+
+    summary = summarise_median_iqr(
+        df_plot,
+        ["District", "Facility_Level", "Source"]
+    )
+
+    offset = 0.2
+
+    fig, axes = plt.subplots(
+        1,
+        len(facility_levels),
+        figsize=(20, 12),
+        sharex="all"
+    )
+
+    for ax, fac_level in zip(axes, facility_levels):
+
+        temp = summary[
+            summary["Facility_Level"] == fac_level
+            ].copy()
+
+        districts = sorted(temp["District"].unique())
+        y = np.arange(len(districts))
+
+        for i, src in enumerate(sources):
+            dat = (
+                temp[temp["Source"] == src]
+                .set_index("District")
+                .reindex(districts)
+            )
+
+            xerr = np.vstack([
+                dat["median"] - dat["q25"],
+                dat["q75"] - dat["median"]
+            ])
+
+            ax.errorbar(
+                x=dat["median"],
+                y=y + (i - 1) * offset,
+                xerr=xerr,
+                fmt=markers[src],
+                color=source_colors[src],
+                markersize=8,
+                elinewidth=1.5,
+                capsize=2,
+                capthick=1.5,
+                linestyle="none",
+                label=src
+            )
+
+            annotate_small_n_horizontal(
+                ax=ax,
+                y_positions=y + (i - 1) * offset,
+                upper_bounds=dat["q75"],
+                counts=dat["n"]
+            )
+
+        ax.set_title(f"Facility Level {fac_level}")
+
+        ax.set_yticks(y)
+        ax.set_yticklabels(districts)
+        ax.invert_yaxis()
+
+        ax.xaxis.set_major_locator(MultipleLocator(20))
+        ax.xaxis.set_minor_locator(MultipleLocator(10))
+
+        ax.grid(axis="x", which="major", alpha=0.5)
+        ax.grid(axis="x", which="minor", alpha=0.25, linestyle=":")
+
+        ax.tick_params(axis="both")
+
+    # axes[0].set_ylabel("District")
+    fig.supxlabel(
+        "Median Daily Patient Load per HCW",
+        y=0.04
+    )
+
+    handles, labels = axes[0].get_legend_handles_labels()
+
+    fig.legend(
+        handles,
+        labels,
+        loc="upper center",
+        ncol=3,
+        frameon=False
+    )
+
+    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
+    plt.show()
+
+    # Plot 2: overall facility levels
+    # y-axis = district, x-axis = patient load
+
+    summary = summarise_median_iqr(
+        df_plot,
+        ["District", "Source"]
+    )
+
+    fig, ax = plt.subplots(figsize=(10, 12))
+
+    districts = sorted(summary["District"].unique())
+    y = np.arange(len(districts))
+
+    offset = 0.2
+
+    for i, src in enumerate(sources):
+        dat = (
+            summary[summary["Source"] == src]
+            .set_index("District")
+            .reindex(districts)
+        )
+
+        xerr = np.vstack([
+            dat["median"] - dat["q25"],
+            dat["q75"] - dat["median"]
+        ])
+
+        ax.errorbar(
+            x=dat["median"],
+            y=y + (i - 1) * offset,
+            xerr=xerr,
+            fmt=markers[src],
+            color=source_colors[src],
+            markersize=8,
+            capsize=2,
+            capthick=1.5,
+            elinewidth=1.5,
+            linestyle="none",
+            label=src
+        )
+
+        annotate_small_n_horizontal(
+            ax=ax,
+            y_positions=y + (i - 1) * offset,
+            upper_bounds=dat["q75"],
+            counts=dat["n"]
+        )
+
+    ax.set_yticks(y)
+    ax.set_yticklabels(districts)
+    ax.invert_yaxis()
+
+    ax.set_xlabel("Median Daily Patient Load per HCW")
+    ax.set_ylabel("District or Central Hospital")
+
+    ax.xaxis.set_major_locator(MultipleLocator(20))
+    ax.xaxis.set_minor_locator(MultipleLocator(10))
+
+    ax.grid(axis="x", which="major", alpha=0.5)
+    ax.grid(axis="x", which="minor", alpha=0.25, linestyle=":")
+
+    ax.legend(frameon=False, loc="best")
+
+    plt.tight_layout()
+    plt.show()
+
+    # Plot 3: over all districts
+    # x-axis = facility level, y-axis = patient load
+
+    summary = summarise_median_iqr(
+        df_plot,
+        ["Facility_Level", "Source"]
+    )
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+
+    x = np.arange(len(facility_levels))
+    offset = 0.12
+
+    for i, src in enumerate(sources):
+        dat = (
+            summary[summary["Source"] == src]
+            .set_index("Facility_Level")
+            .reindex(facility_levels)
+        )
+
+        y = dat["median"]
+
+        yerr = np.vstack([
+            dat["median"] - dat["q25"],
+            dat["q75"] - dat["median"]
+        ])
+
+        ax.errorbar(
+            x=x + (i - 1) * offset,
+            y=y,
+            yerr=yerr,
+            fmt=markers[src],
+            color=source_colors[src],
+            markersize=8,
+            capsize=2,
+            capthick=1.5,
+            elinewidth=1.5,
+            linestyle="none",
+            label=src
+        )
+
+        annotate_small_n_vertical(
+            ax=ax,
+            x_positions=x + (i - 1) * offset,
+            upper_bounds=dat["q75"],
+            counts=dat["n"]
+        )
+
+    ax.set_xticks(x)
+    ax.set_xticklabels(facility_levels)
+
+    ax.grid(axis="y", which="major", alpha=0.5)
+    ax.grid(axis="y", which="minor", alpha=0.25, linestyle=":")
+
+    ax.set_xlabel("Facility Level")
+    ax.set_ylabel("Median Daily Patient Load per HCW")
+
+    ax.legend(frameon=False)
+
+    plt.tight_layout()
+    plt.show()
+
+    # Plot 4: over all districts and facility levels
+    # x-axis = source, y-axis = patient load
+
+    summary = (
+        summarise_median_iqr(
+            df_plot,
+            ["Source"]
+        )
+        .set_index("Source")
+        .reindex(sources)
+        .reset_index()
+    )
+
+    yerr = np.vstack([
+        summary["median"] - summary["q25"],
+        summary["q75"] - summary["median"]
+    ])
+
+    colors = [source_colors[src] for src in sources]
+
+    fig, ax = plt.subplots(figsize=(7, 6))
+
+    bars = ax.bar(
+        x=np.arange(len(sources)),
+        height=summary["median"],
+        yerr=yerr,
+        capsize=5,
+        color=colors,
+        alpha=0.7,
+        width=0.7
+    )
+
+    annotate_small_n_vertical(
+        ax=ax,
+        x_positions=np.arange(len(sources)),
+        upper_bounds=summary["q75"],
+        counts=summary["n"]
+    )
+
+    ax.set_xticks(np.arange(len(sources)))
+    ax.set_xticklabels(sources)
+
+    ax.grid(axis="y", which="major", alpha=0.5)
+    ax.grid(axis="y", which="minor", alpha=0.25, linestyle=":")
+
+    ax.set_xlabel("Source")
+    ax.set_ylabel("Median Daily Patient Load per HCW")
+
+    plt.tight_layout()
+    plt.show()
+
+    # ** mean + 95%CI plots **
+
+    def summarise_mean_ci95(data, group_cols):
+        summary = (
+            data
+            .groupby(group_cols)[patient_load_col]
+            .agg(
+                mean="mean",
+                sd="std",
+                n="count"
+            )
+            .reset_index()
+        )
+
+        summary["se"] = summary["sd"] / np.sqrt(summary["n"])
+        summary["ci95"] = 1.96 * summary["se"]
+
+        summary["ci95"] = summary["ci95"].fillna(0)
+
+        return summary
+
+    # Plot 1: by facility level
+    # y-axis = district, x-axis = patient load
+
+    summary = summarise_mean_ci95(
+        df_plot,
+        ["District", "Facility_Level", "Source"]
+    )
+
+    offset = 0.2
+
+    fig, axes = plt.subplots(
+        1,
+        len(facility_levels),
+        figsize=(20, 12),
+        sharex="all"
+    )
+
+    for ax, fac_level in zip(axes, facility_levels):
+
+        temp = summary[
+            summary["Facility_Level"] == fac_level
+            ].copy()
+
+        districts = sorted(temp["District"].unique())
+        y = np.arange(len(districts))
+
+        for i, src in enumerate(sources):
+            dat = (
+                temp[temp["Source"] == src]
+                .set_index("District")
+                .reindex(districts)
+            )
+
+            lower_err = np.minimum(dat["ci95"], dat["mean"])
+            upper_err = dat["ci95"]
+
+            xerr = np.vstack([
+                lower_err,
+                upper_err
+            ])
+
+            ax.errorbar(
+                x=dat["mean"],
+                y=y + (i - 1) * offset,
+                xerr=xerr,
+                fmt=markers[src],
+                color=source_colors[src],
+                markersize=8,
+                elinewidth=1.5,
+                capsize=2,
+                capthick=1.5,
+                linestyle="none",
+                label=src
+            )
+
+            annotate_small_n_horizontal(
+                ax=ax,
+                y_positions=y + (i - 1) * offset,
+                upper_bounds=dat["mean"] + dat["ci95"],
+                counts=dat["n"]
+            )
+
+        ax.set_title(f"Facility Level {fac_level}")
+
+        ax.set_yticks(y)
+        ax.set_yticklabels(districts)
+        ax.invert_yaxis()
+
+        ax.xaxis.set_major_locator(MultipleLocator(20))
+        ax.xaxis.set_minor_locator(MultipleLocator(10))
+
+        ax.grid(axis="x", which="major", alpha=0.5)
+        ax.grid(axis="x", which="minor", alpha=0.25, linestyle=":")
+
+        ax.tick_params(axis="both")
+
+    fig.supxlabel(
+        "Daily Patient Load per HCW",
+        y=0.04
+    )
+
+    handles, labels = axes[0].get_legend_handles_labels()
+
+    fig.legend(
+        handles,
+        labels,
+        loc="upper center",
+        ncol=3,
+        frameon=False
+    )
+
+    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
+    plt.show()
+
+    # Plot 2: overall facility levels
+    # y-axis = district, x-axis = patient load
+
+    summary = summarise_mean_ci95(
+        df_plot,
+        ["District", "Source"]
+    )
+
+    fig, ax = plt.subplots(figsize=(10, 12))
+
+    districts = sorted(summary["District"].unique())
+    y = np.arange(len(districts))
+
+    offset = 0.2
+
+    for i, src in enumerate(sources):
+        dat = (
+            summary[summary["Source"] == src]
+            .set_index("District")
+            .reindex(districts)
+        )
+
+        lower_err = np.minimum(dat["ci95"], dat["mean"])
+        upper_err = dat["ci95"]
+
+        xerr = np.vstack([
+            lower_err,
+            upper_err
+        ])
+
+        ax.errorbar(
+            x=dat["mean"],
+            y=y + (i - 1) * offset,
+            xerr=xerr,
+            fmt=markers[src],
+            color=source_colors[src],
+            markersize=8,
+            capsize=2,
+            capthick=1.5,
+            elinewidth=1.5,
+            linestyle="none",
+            label=src
+        )
+
+        annotate_small_n_horizontal(
+            ax=ax,
+            y_positions=y + (i - 1) * offset,
+            upper_bounds=dat["mean"] + dat["ci95"],
+            counts=dat["n"]
+        )
+
+    ax.set_yticks(y)
+    ax.set_yticklabels(districts)
+    ax.invert_yaxis()
+
+    ax.set_xlabel("Mean Daily Patient Load per HCW")
+    ax.set_ylabel("District or Central Hospital")
+
+    ax.xaxis.set_major_locator(MultipleLocator(20))
+    ax.xaxis.set_minor_locator(MultipleLocator(10))
+
+    ax.grid(axis="x", which="major", alpha=0.5)
+    ax.grid(axis="x", which="minor", alpha=0.25, linestyle=":")
+
+    ax.legend(frameon=False, loc="best")
+
+    plt.tight_layout()
+    plt.show()
+
+    # Plot 3: over all districts
+    # x-axis = facility level, y-axis = patient load
+
+    summary = summarise_mean_ci95(
+        df_plot,
+        ["Facility_Level", "Source"]
+    )
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+
+    x = np.arange(len(facility_levels))
+    offset = 0.12
+
+    for i, src in enumerate(sources):
+        dat = (
+            summary[summary["Source"] == src]
+            .set_index("Facility_Level")
+            .reindex(facility_levels)
+        )
+
+        y = dat["mean"]
+
+        lower_err = np.minimum(dat["ci95"], dat["mean"])
+        upper_err = dat["ci95"]
+
+        yerr = np.vstack([
+            lower_err,
+            upper_err
+        ])
+
+        ax.errorbar(
+            x=x + (i - 1) * offset,
+            y=y,
+            yerr=yerr,
+            fmt=markers[src],
+            color=source_colors[src],
+            markersize=8,
+            capsize=2,
+            capthick=1.5,
+            elinewidth=1.5,
+            linestyle="none",
+            label=src
+        )
+
+        annotate_small_n_vertical(
+            ax=ax,
+            x_positions=x + (i - 1) * offset,
+            upper_bounds=dat["mean"] + dat["ci95"],
+            counts=dat["n"]
+        )
+
+    ax.set_xticks(x)
+    ax.set_xticklabels(facility_levels)
+
+    ax.grid(axis="y", which="major", alpha=0.5)
+    ax.grid(axis="y", which="minor", alpha=0.25, linestyle=":")
+
+    ax.set_xlabel("Facility Level")
+    ax.set_ylabel("Mean Daily Patient Load per HCW")
+
+    ax.legend(frameon=False)
+
+    plt.tight_layout()
+    plt.show()
+
+    # Plot 4: over all districts and facility levels
+    # x-axis = source, y-axis = patient load
+
+    summary = (
+        summarise_mean_ci95(
+            df_plot,
+            ["Source"]
+        )
+        .set_index("Source")
+        .reindex(sources)
+        .reset_index()
+    )
+
+    lower_err = np.minimum(summary["ci95"], summary["mean"])
+    upper_err = summary["ci95"]
+
+    yerr = np.vstack([
+        lower_err,
+        upper_err
+    ])
+
+    colors = [source_colors[src] for src in sources]
+
+    fig, ax = plt.subplots(figsize=(7, 6))
+
+    bars = ax.bar(
+        x=np.arange(len(sources)),
+        height=summary["mean"],
+        yerr=yerr,
+        capsize=5,
+        color=colors,
+        alpha=0.7,
+        width=0.7
+    )
+
+    annotate_small_n_vertical(
+        ax=ax,
+        x_positions=np.arange(len(sources)),
+        upper_bounds=summary["mean"] + summary["ci95"],
+        counts=summary["n"]
+    )
+
+    ax.set_xticks(np.arange(len(sources)))
+    ax.set_xticklabels(sources)
+
+    ax.grid(axis="y", which="major", alpha=0.5)
+    ax.grid(axis="y", which="minor", alpha=0.25, linestyle=":")
+
+    ax.set_xlabel("Source")
+    ax.set_ylabel("Mean Daily Patient Load per HCW")
+
+    plt.tight_layout()
+    plt.show()
 
     # hcw_count = extract_results(
     #     results_folder,

From 9adb29df271424f1da9f7242245ca79dc5227606 Mon Sep 17 00:00:00 2001
From: Bingling <b.she@imperial.ac.uk>
Date: Fri, 29 May 2026 17:31:26 +0100
Subject: [PATCH 6/6] add the choice to adjust daily staff counts considering
 some HCWs may not be on duty on any given day

---
 .../analysis_scripts/analysis_patient_mix.py          | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
index 5feccd1e70..eb75ed12df 100644
--- a/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
+++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_patient_mix.py
@@ -196,8 +196,17 @@ def merge_info_from_mfl(_df):
         daily_patient_load_per_hcw["Facility_ID"] == 130, "District"
     ] = "Central Hospitals (Central)"
 
+    # 0.5649 is the prob. that any HCW is on duty on any day (estimates from CHAI data)
+    # given TLO assume the same HCWs in the HS
+    # alongside, we assume the patient seeking care independently of the availability of HCWs?
+    # otherwise, would need to adjust patient volume in the nominator, too; by which, the adjustments may cancel out.
+    # daily_patient_load_per_hcw['Daily_Patient_Load_Per_HCW'] = (daily_patient_load_per_hcw['Patient_Volume']
+    #                                                             / (daily_patient_load_per_hcw['Staff_Count'] * 0.5649)
+    #                                                             )
+
     daily_patient_load_per_hcw['Daily_Patient_Load_Per_HCW'] = (daily_patient_load_per_hcw['Patient_Volume']
-                                                                / daily_patient_load_per_hcw['Staff_Count'])
+                                                                / (daily_patient_load_per_hcw['Staff_Count'] * 1.0)
+                                                                )
 
     # read in TLM estimates
     path_to_tlm_folder = (