From 83e4fbf70ad041ab3fc8f9d7c3f6f5120fef576e Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 28 Aug 2025 14:39:04 +0000 Subject: [PATCH 01/21] Updating STAR to use slurm and newer yirgacheffe --- prepare_layers/make_masks.py | 47 ++++---- prepare_species/extract_species_data_psql.py | 11 +- requirements.txt | 2 +- scripts/slurm.sh | 115 +++++++++++++++++++ threats/threat_processing.py | 7 +- threats/threat_summation.py | 7 +- 6 files changed, 148 insertions(+), 41 deletions(-) create mode 100644 scripts/slurm.sh diff --git a/prepare_layers/make_masks.py b/prepare_layers/make_masks.py index f5d9756..a4b5ce1 100644 --- a/prepare_layers/make_masks.py +++ b/prepare_layers/make_masks.py @@ -1,62 +1,57 @@ import argparse import os import sys -from glob import glob +from pathlib import Path from typing import Set -import numpy as np -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg +import yirgacheffe.operators as yo OPEN_SEA_LCC = "lcc_200.tif" NO_DATA_LCC = "lcc_0.tif" def prepare_mask( - layers: Set[str], - output_path: str, + layers: Set[Path], + output_path: Path, at_least: bool = True, ) -> None: assert layers - rasters = [RasterLayer.layer_from_file(x) for x in layers] - - intersection = RasterLayer.find_intersection(rasters) - for r in rasters: - r.set_window_for_intersection(intersection) + rasters = [yg.read_raster(x) for x in layers] calc = rasters[0] for r in rasters[1:]: calc = calc + r if at_least: - calc = calc.numpy_apply(lambda a: np.where(a >= 0.5, 1.0, 0.0)) + calc = yo.where(calc >= 0.5, 1.0, 0.0) else: - calc = calc.numpy_apply(lambda a: np.where(a > 0.5, 1.0, 0.0)) + calc = yo.where(calc > 0.5, 1.0, 0.0)) - with RasterLayer.empty_raster_layer_like(rasters[0], filename=output_path) as result: - calc.parallel_save(result) + calc.to_geotiff(output_path, parallelism=128) def prepare_masks( - habitat_layers_path: str, - output_directory_path: str, + habitat_layers_path: Path, + output_directory_path: Path, ) -> None: os.makedirs(output_directory_path, exist_ok=True) - layer_files = set(glob("lcc_*.tif", root_dir=habitat_layers_path)) + layer_files = set(habitat_layers_path.glob("lcc_*.tif")) if not layer_files: sys.exit(f"Found no habitat layers in {habitat_layers_path}") - marine_layers = layer_files & set([OPEN_SEA_LCC]) - terrerstrial_layers = layer_files - set([OPEN_SEA_LCC, NO_DATA_LCC]) + marine_layers = {x for x in layer_files if x.name == OPEN_SEA_LCC} + terrerstrial_layers = {x for x in layer_files if x.name not in [OPEN_SEA_LCC, NO_DATA_LCC]} assert len(marine_layers) == 1 - assert len(terrerstrial_layers) == len(layer_files) - 2 + assert len(terrerstrial_layers) < len(layer_files) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in marine_layers}, - os.path.join(output_directory_path, "marine_mask.tif"), + marine_layers, + output_directory_path / "marine_mask.tif", ) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in terrerstrial_layers}, - os.path.join(output_directory_path, "terrestrial_mask.tif"), + terrerstrial_layers, + output_directory_path / "terrestrial_mask.tif", at_least=True, ) @@ -66,14 +61,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generate terrestrial and marine masks.") parser.add_argument( '--habitat_layers', - type=str, + type=Path, help="directory with split and scaled habitat layers", required=True, dest="habitat_layers" ) parser.add_argument( '--output_directory', - type=str, + type=Path, help="Folder for output mask layers", required=True, dest="output_directory" diff 
--git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py
index 0ceda78..5f61eb8 100644
--- a/prepare_species/extract_species_data_psql.py
+++ b/prepare_species/extract_species_data_psql.py
@@ -23,7 +23,7 @@
 logger.setLevel(logging.DEBUG)
 
 # To match the FABDEM elevation map we use
-# different range min/max/seperation
+# different range min/max/separation
 ELEVATION_MAX = 8580
 ELEVATION_MIN = -427
 ELEVATION_SPREAD = 12
@@ -31,6 +31,7 @@
 COLUMNS = [
     "id_no",
     "assessment_id",
+    "assessment_year",
     "season",
     "systems",
     "elevation_lower",
@@ -61,6 +62,7 @@
 SELECT
     assessments.sis_taxon_id as id_no,
     assessments.id as assessment_id,
+    DATE_PART('year', assessments.assessment_date) as assessment_year,
     assessments.possibly_extinct,
     assessments.possibly_extinct_in_the_wild,
     (assessment_supplementary_infos.supplementary_fields->>'ElevationLower.limit')::numeric AS elevation_lower,
@@ -336,7 +338,7 @@ def process_row(
     register(connection)
     cursor = connection.cursor()
 
-    id_no, assessment_id, possibly_extinct, possibly_extinct_in_the_wild, \
+    id_no, assessment_id, assessment_year, possibly_extinct, possibly_extinct_in_the_wild, \
         elevation_lower, elevation_upper, scientific_name, family_name, category = row
 
     report = SpeciesReport(id_no, assessment_id, scientific_name)
@@ -378,6 +380,7 @@ def process_row(
         [[
             id_no,
             assessment_id,
+            assessment_year,
             "all",
             systems,
             int(elevation_lower) if elevation_lower is not None else None,
@@ -471,10 +474,6 @@ def extract_data_per_species(
             partial(process_row, class_name, era_output_directory_path, target_projection, presence),
             results
         )
-        # reports = [
-        #     process_row(class_name, era_output_directory_path, target_projection, presence, x)
-        #     for x in results[:10]
-        # ]
 
     reports_df = pd.DataFrame(
         [x.as_row() for x in reports],
diff --git a/requirements.txt b/requirements.txt
index 7c9a8ef..2a7a3dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,12 +9,12 @@ pyproj
 scikit-image
 requests
 zenodo_search
+yirgacheffe
 
 gdal[numpy]
 
 git+https://github.com/quantifyearth/iucn_modlib
 git+https://github.com/quantifyearth/pyshark
-git+https://github.com/quantifyearth/yirgacheffe@4a2cab77f4a64e3f09497ee7098dc9ba499cda90
 
 pylint
 mypy
diff --git a/scripts/slurm.sh b/scripts/slurm.sh
new file mode 100644
index 0000000..c0eed55
--- /dev/null
+++ b/scripts/slurm.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# Assumes you've set up a Python virtual environment in the current directory.
+#
+# In addition to the Python environment, you will need the following extra command line tools:
+#
+# https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
+# https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
+
+set -e
+
+# shellcheck disable=SC1091
+source ${HOME}/venvs/life/bin/activate
+cd ${HOME}/dev/star
+export PATH=$PATH:$HOME/go/bin
+
+if [ -z "${DATADIR}" ]; then
+    echo "Please specify DATADIR"
+    exit 1
+fi
+
+if [ -z "${VIRTUAL_ENV}" ]; then
+    echo "Please run within a virtualenv"
+    exit 1
+fi
+
+declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
+
+# Get habitat layer and prepare for use
+if [ ! -d ${DATADIR}/habitat_layers ]; then
+    if [ ! -f ${DATADIR}/habitat/raw.tif ]; then
+        echo "Fetching habitat map..."
+        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif
+    fi
+
+    echo "Processing habitat map..."
+ python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \ + --scale 1000.0 \ + --projection "ESRI:54009" \ + --output ${DATADIR}/tmp_habitat_layers/current + mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers +fi + +if [ ! -d ${DATADIR}/masks ]; then + echo "Processing masks..." + python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \ + --output_directory ${DATADIR}/masks +fi + +# Fetch and prepare the elevation layers +if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then + if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then + echo "Fetching elevation map..." + reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif + fi + if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then + echo "Generating elevation max layer..." + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif + fi + if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then + echo "Generating elevation min layer..." + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif + fi +fi + +# Generate the crosswalk table +if [ ! -f ${DATADIR}/crosswalk.csv ]; then + echo "Generating crosswalk table..." + python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv +fi + +# Get species data per taxa from IUCN data +for TAXA in "${TAXALIST[@]}" +do + echo "Extracting species data for ${TAXA}..." + python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv +done + +if [ -f data/BL_Species_Elevations_2023.csv ]; then + echo "Applying birdlife data..." + python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv +fi + +echo "Generating AoH task list..." +python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv + +echo "Generating AoHs..." +littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py + +# Calculate predictors from AoHs +echo "Generating species richness..." +python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \ + --output ${DATADIR}/summaries/species_richness.tif +echo "Generating endemism..." +python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \ + --species_richness ${DATADIR}/summaries/species_richness.tif \ + --output ${DATADIR}/summaries/endemism.tif + +# Aoh Validation +echo "Collating validation data..." +python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \ + --output ${DATADIR}/validation/aohs.csv +echo "Calculating model validation..." +python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \ + --output ${DATADIR}/validation/model_validation.csv + +# Threats +echo "Generating threat task list..." 
+python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv + +echo "Generating threat rasters..." +littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py + +echo "Summarising threats..." +python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results diff --git a/threats/threat_processing.py b/threats/threat_processing.py index 2a0adca..8e6011e 100644 --- a/threats/threat_processing.py +++ b/threats/threat_processing.py @@ -4,8 +4,8 @@ import sys import geopandas as gpd +import yirgacheffe as yg from pyogrio.errors import DataSourceError -from yirgacheffe.layers import RasterLayer def threat_processing_per_species( species_data_path: str, @@ -17,7 +17,7 @@ def threat_processing_per_species( except DataSourceError: sys.exit(f"Failed to read {species_data_path}") - with RasterLayer.layer_from_file(aoh_path) as aoh: + with yg.read_raster(aoh_path) as aoh: os.makedirs(output_directory_path, exist_ok=True) @@ -49,8 +49,7 @@ def threat_processing_per_species( threat_dir_path = os.path.join(output_directory_path, str(threat_id)) os.makedirs(threat_dir_path, exist_ok=True) output_path = os.path.join(threat_dir_path, f"{taxon_id}.tif") - with RasterLayer.empty_raster_layer_like(aoh, filename=output_path) as result: - per_threat_per_species_score.save(result) + per_threat_per_species_score.to_geotiff(output_path) def main() -> None: os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0" diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 54bf6a5..8905479 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import List -from yirgacheffe.layers import RasterLayer # type: ignore +import yirgacheffe as yg from osgeo import gdal gdal.SetCacheMax(1024 * 1024 * 32) @@ -26,7 +26,7 @@ def worker( if path is None: break - with RasterLayer.layer_from_file(path) as partial_raster: + with yg.read_raster(path) as partial_raster: if merged_result is None: merged_result = RasterLayer.empty_raster_layer_like(partial_raster) cleaned_raster = partial_raster.nan_to_num() @@ -38,8 +38,7 @@ def worker( merged_result = temp if merged_result: - final = RasterLayer.empty_raster_layer_like(merged_result, filename=output_tif) - merged_result.save(final) + merged_result.to_geotiff(output_tif) def raster_sum( images_list: List[Path], From 7d3597d467fb14696deca31ebb507e2493a54c12 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 29 Aug 2025 10:41:39 +0100 Subject: [PATCH 02/21] Add birdlife data script --- prepare_species/apply_birdlife_data.py | 81 ++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 prepare_species/apply_birdlife_data.py diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py new file mode 100644 index 0000000..9b46216 --- /dev/null +++ b/prepare_species/apply_birdlife_data.py @@ -0,0 +1,81 @@ +import argparse +import importlib +import math +import os + +import geopandas as gpd +import pandas as pd + +aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") + + +# Columns from current BirdLife data overrides: +# SIS ID +# Assessment ID +# WBDB ID +# Sequence +# Scientific name +# Common name +# RL Category +# PE +# PEW +# Min altitude (m) +# Max altitude (m) +# Occasional lower elevation +# Occasional upper elevation + 
+def apply_birdlife_data( + geojson_directory_path: str, + overrides_path: str, +) -> None: + overrides = pd.read_csv(overrides_path, encoding="latin1") + + for _, row in overrides.iterrows(): + if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): + continue + + path = os.path.join(geojson_directory_path, "AVES", "current", f"{row["SIS ID"]}.geojson") + if not os.path.exists(path): + continue + + species_info = gpd.read_file(path) + data = species_info.loc[0].copy() + + if not math.isnan(row["Occasional lower elevation"]): + data.elevation_lower = float(row["Occasional lower elevation"]) + else: + data.elevation_lower = float(data.elevation_lower) + if not math.isnan(row["Occasional upper elevation"]): + data.elevation_upper = float(row["Occasional upper elevation"]) + else: + data.elevation_upper = float(data.elevation_upper) + data = aoh_cleaning.tidy_data(data) + + res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry") + res.to_file(path, driver="GeoJSON") + +def main() -> None: + parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") + parser.add_argument( + '--geojsons', + type=str, + help='Directory where per species Geojson is stored', + required=True, + dest='geojson_directory_path', + ) + parser.add_argument( + '--overrides', + type=str, + help="CSV of overrides", + required=True, + dest="overrides", + ) + args = parser.parse_args() + + apply_birdlife_data( + args.geojson_directory_path, + args.overrides + ) + +if __name__ == "__main__": + main() From ceed2afacade9c693ef30af1fb4506987bd0e332 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 29 Aug 2025 10:42:23 +0100 Subject: [PATCH 03/21] Force year to int --- prepare_species/extract_species_data_psql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 5f61eb8..29d2872 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -380,7 +380,7 @@ def process_row( [[ id_no, assessment_id, - assessment_year, + int(assessment_year), "all", systems, int(elevation_lower) if elevation_lower is not None else None, From 41c99a0ed5047bfee468a38916ae1cc4fb080946 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 15:56:29 +0100 Subject: [PATCH 04/21] Update AOH from sub module to pip --- .gitmodules | 3 --- aoh-calculator | 1 - prepare_species/apply_birdlife_data.py | 6 ++---- prepare_species/extract_species_data_psql.py | 5 ++--- requirements.txt | 1 + 5 files changed, 5 insertions(+), 11 deletions(-) delete mode 160000 aoh-calculator diff --git a/.gitmodules b/.gitmodules index 30fd8a6..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "aoh-calculator"] - path = aoh-calculator - url = git@github.com:quantifyearth/aoh-calculator.git diff --git a/aoh-calculator b/aoh-calculator deleted file mode 160000 index c24def9..0000000 --- a/aoh-calculator +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c24def960799f170a9812af31d4c0e2dc5940dbf diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 9b46216..f712d24 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -3,12 +3,10 @@ import math import os +import aoh import geopandas as gpd import pandas as pd -aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") 
- - # Columns from current BirdLife data overrides: # SIS ID # Assessment ID @@ -49,7 +47,7 @@ def apply_birdlife_data( data.elevation_upper = float(row["Occasional upper elevation"]) else: data.elevation_upper = float(data.elevation_upper) - data = aoh_cleaning.tidy_data(data) + data = aoh.tidy_data(data) res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry") res.to_file(path, driver="GeoJSON") diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 29d2872..9671ba3 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -8,7 +8,7 @@ from multiprocessing import Pool from typing import Any, List, Optional, Set, Tuple -# import pyshark # pylint: disable=W0611 +import aoh import geopandas as gpd import pandas as pd import pyproj @@ -16,7 +16,6 @@ import shapely from postgis.psycopg import register -aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") logger = logging.getLogger(__name__) logging.basicConfig() @@ -195,7 +194,7 @@ def tidy_reproject_save( target_crs = pyproj.CRS.from_string(target_projection) if target_projection else src_crs graw = gdf.loc[0].copy() - grow = aoh_cleaning.tidy_data( + grow = aoh.tidy_data( graw, elevation_max=ELEVATION_MAX, elevation_min=ELEVATION_MIN, diff --git a/requirements.txt b/requirements.txt index 2a7a3dd..31c1497 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ scikit-image requests zenodo_search yirgacheffe +aoh gdal[numpy] From 7af51e229ee569f82ab43a9807680691e252f8fd Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:00:07 +0100 Subject: [PATCH 05/21] Fixes spotted by linter --- prepare_layers/make_masks.py | 2 +- prepare_species/apply_birdlife_data.py | 2 +- threats/threat_summation.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/prepare_layers/make_masks.py b/prepare_layers/make_masks.py index a4b5ce1..2557280 100644 --- a/prepare_layers/make_masks.py +++ b/prepare_layers/make_masks.py @@ -24,7 +24,7 @@ def prepare_mask( if at_least: calc = yo.where(calc >= 0.5, 1.0, 0.0) else: - calc = yo.where(calc > 0.5, 1.0, 0.0)) + calc = yo.where(calc > 0.5, 1.0, 0.0) calc.to_geotiff(output_path, parallelism=128) diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index f712d24..0380477 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -32,7 +32,7 @@ def apply_birdlife_data( if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): continue - path = os.path.join(geojson_directory_path, "AVES", "current", f"{row["SIS ID"]}.geojson") + path = os.path.join(geojson_directory_path, "AVES", "current", f'{row["SIS ID"]}.geojson') if not os.path.exists(path): continue diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 8905479..5f1ffdd 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -8,6 +8,7 @@ from typing import List import yirgacheffe as yg +from yirgacheffe.layers import RasterLayer from osgeo import gdal gdal.SetCacheMax(1024 * 1024 * 32) From 9f9f686db20573916447ce90b8aea410d4134f7a Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:00:18 +0100 Subject: [PATCH 06/21] Update method --- method.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/method.md b/method.md index f9d3fce..06daf8d 
100644
--- a/method.md
+++ b/method.md
@@ -120,8 +120,12 @@ python3 ./prepare_layers/make_masks.py --habitat_layers /data/habitat_layers/cur
 To assist with provenance, we download the data from the Zenodo ID.
 
 ```shark-run:reclaimer
-curl -o FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip
-...
+curl -o /data/FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip
+```
+
+```shark-run:gdalonly
+python3 tbd.py --input /data/FABDEM.zip \
+               --output /data/elevation.tif
 ```
 
 Similarly to the habitat map we need to resample to 1km, however rather than picking the mean elevation, we select both the min and max elevation for each pixel, and then check whether the species is in that range when we calculate AoH.
@@ -214,4 +218,18 @@ python3 ./aoh-calculator/validation/validate_map_prevelence.py --collated_aoh_da
 
 ```shark-publish
 /data/validation/model_validation.csv
-```
\ No newline at end of file
+```
+
+## Threats
+
+```shark-run:aohbuilder
+python3 ./threats/threat_processing.py --speciesdata /data/species-info/* \
+                                       --aoh /data/aohs/ \
+                                       --output /data/threat_rasters
+
+python3 ./threats/threat_summation.py --threat_rasters /data/threat_rasters --output /data/threat_results
+```
+
+```shark-publish
+/data/threat_results
+```

From d031255c7a3c02ec278b36796b0c67acb7699691 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 2 Oct 2025 16:15:49 +0100
Subject: [PATCH 07/21] Enforce mypy checks

---
 .github/workflows/python-package.yml         | 17 +++++++++++------
 .mypy.ini                                    |  4 ++++
 Dockerfile                                   |  1 +
 prepare_species/extract_species_data_psql.py | 12 ++++++------
 utils/aoh_generator.py                       | 15 +++++++--------
 utils/threats_generator.py                   | 17 ++++++++---------
 6 files changed, 37 insertions(+), 29 deletions(-)
 create mode 100644 .mypy.ini

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8fd17a0..397183d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: ghcr.io/osgeo/gdal:ubuntu-small-3.10.3
+    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.3
     strategy:
       fail-fast: false
       matrix:
@@ -26,18 +26,23 @@ jobs:
     - uses: actions/checkout@v4
       with:
         submodules: 'true'
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install gdal[numpy]==3.10.3
+        python -m pip install gdal[numpy]==3.11.3
         python -m pip install -r requirements.txt
+
     - name: Lint with pylint
-      run: |
-        python3 -m pylint utils prepare_layers prepare_species threats
+      run: python3 -m pylint utils prepare_layers prepare_species threats
+
+    - name: Type checking with mypy
+      run: python3 -m mypy utils prepare_layers prepare_species threats
+
     - name: Tests
-      run: |
-        python3 -m pytest ./tests
+      run: python3 -m pytest ./tests
diff --git a/.mypy.ini b/.mypy.ini
new file mode 100644
index 0000000..d8ac83e
--- /dev/null
+++ b/.mypy.ini
@@ -0,0 +1,4 @@
+[mypy]
+ignore_missing_imports = True
+explicit_package_bases = False
+no_namespace_packages = True
diff --git a/Dockerfile b/Dockerfile
index ade1030..1217663 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,3 +53,4 @@ ENV PYTHONPATH=/root/star
 
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
+RUN python3 -m mypy prepare_layers prepare_species utils tests
diff --git
a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 9671ba3..938547a 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -210,7 +210,7 @@ def tidy_reproject_save( def process_systems( systems_data: List[Tuple], report: SpeciesReport, -) -> None: +) -> List: if len(systems_data) == 0: raise ValueError("No systems found") if len(systems_data) > 1: @@ -250,9 +250,9 @@ def process_systems( ] def process_threats( - threat_data: List, + threat_data: List[Tuple[int,str,str]], report: SpeciesReport, -) -> bool: +) -> List[Tuple[int,int]]: cleaned_threats = [] for code, scope, severity in threat_data: if scope is None or scope.lower() == "unknown": @@ -329,9 +329,9 @@ def process_row( class_name: str, output_directory_path: str, target_projection: Optional[str], - presence: Tuple[int], + presence: Tuple[int, ...], row: Tuple, -) -> Tuple: +) -> SpeciesReport: connection = psycopg2.connect(DB_CONFIG) register(connection) @@ -436,7 +436,7 @@ def extract_data_per_species( connection = psycopg2.connect(DB_CONFIG) cursor = connection.cursor() - excludes = tuple([]) + excludes: Tuple = tuple([]) if excludes_path is not None: try: df = pd.read_csv(excludes_path) diff --git a/utils/aoh_generator.py b/utils/aoh_generator.py index 1392f36..2408cf0 100644 --- a/utils/aoh_generator.py +++ b/utils/aoh_generator.py @@ -7,12 +7,11 @@ import pandas as pd def aoh_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -49,21 +48,21 @@ def main() -> None: parser = argparse.ArgumentParser(description="Species and seasonality generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output" diff --git a/utils/threats_generator.py b/utils/threats_generator.py index 7b2a281..6538e8c 100644 --- a/utils/threats_generator.py +++ b/utils/threats_generator.py @@ -7,12 +7,11 @@ import pandas as pd def threats_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -40,24 +39,24 @@ def threats_generator( df.to_csv(output_csv_path, index=False) def main() -> None: - parser = argparse.ArgumentParser(description="threat tasts generator.") + parser = argparse.ArgumentParser(description="threat tasks generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output" From 0970c4c455ac87dbb1551fdcbfab262058e962a9 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:21:48 +0100 Subject: [PATCH 
08/21] Remove unused imports --- prepare_species/apply_birdlife_data.py | 1 - prepare_species/extract_species_data_psql.py | 1 - 2 files changed, 2 deletions(-) diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 0380477..60c2a78 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -1,5 +1,4 @@ import argparse -import importlib import math import os diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 938547a..f49087a 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -1,5 +1,4 @@ import argparse -import importlib import json import logging import math From 18b4f8c337bf49d48227e47f3226458fdb3b8eb3 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 18:54:00 +0100 Subject: [PATCH 09/21] Typing on the threats utils --- threats/threat_summation.py | 54 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 5f1ffdd..194d157 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -14,12 +14,9 @@ gdal.SetCacheMax(1024 * 1024 * 32) def worker( - filename: str, - result_dir: str, + output_tif: Path, input_queue: Queue, ) -> None: - output_tif = os.path.join(result_dir, filename) - merged_result = None while True: @@ -43,19 +40,18 @@ def worker( def raster_sum( images_list: List[Path], - output_filename: str, + output_filename: Path, processes_count: int ) -> None: - result_dir, filename = os.path.split(output_filename) - os.makedirs(result_dir, exist_ok=True) + os.makedirs(output_filename.parent, exist_ok=True) - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir_str: + tempdir = Path(tempdir_str) with Manager() as manager: source_queue = manager.Queue() workers = [Process(target=worker, args=( - f"{index}.tif", - tempdir, + tempdir / f"{index}.tif", source_queue )) for index in range(processes_count)] for worker_process in workers: @@ -80,8 +76,7 @@ def raster_sum( # here we should have now a set of images in tempdir to merge single_worker = Process(target=worker, args=( - filename, - result_dir, + output_filename, source_queue )) single_worker.start() @@ -103,17 +98,17 @@ def raster_sum( time.sleep(1) def reduce_to_next_level( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) + files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: code, _ = os.path.splitext(filename.name) next_level_threat_id = ".".join(code.split('.')[:-1]) @@ -126,22 +121,22 @@ def reduce_to_next_level( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) def reduce_from_species( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) 
+ files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: threat_code = filename.parts[-2] levels = threat_code.split('.') @@ -159,31 +154,30 @@ def reduce_from_species( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) - def threat_summation( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: os.makedirs(output_directory, exist_ok=True) # All these files are at level3 to start with, so first make level2 print("processing level 2") - level2_target = os.path.join(output_directory, "level2") + level2_target = output_directory / "level2" reduce_from_species(rasters_directory, level2_target, processes_count) # Now reduce level2 to level1 print("processing level 1") - level1_target = os.path.join(output_directory, "level1") + level1_target = output_directory / "level1" reduce_to_next_level(level2_target, level1_target, processes_count) # Now build a final top level STAR print("processing level 0") - final_target = os.path.join(output_directory, "level0") + final_target = output_directory / "level0" reduce_to_next_level(level1_target, final_target, processes_count) @@ -191,14 +185,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generates the combined, and level 1 and level 2 threat rasters.") parser.add_argument( "--threat_rasters", - type=str, + type=Path, required=True, dest="rasters_directory", help="GeoTIFF file containing level three per species threats" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_directory", help="Destination directory file for results." 
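For context on the two reduction patches above: `reduce_from_species` and `reduce_to_next_level` both group rasters by trimming one dotted level from each threat code, so `2.3.1.tif` and `2.3.2.tif` are summed into `2.3.tif`, and `2.3.tif` in turn into `2.tif`. A minimal standalone sketch of that bucketing logic, assuming IUCN-style dotted threat codes as filenames (the `bucket_by_parent_level` helper name is illustrative, not part of the patch):

```python
from pathlib import Path

def bucket_by_parent_level(files: list[Path]) -> dict[str, list[Path]]:
    # Strip the ".tif" suffix, then drop the last dotted component, so
    # "2.3.1.tif" lands in the "2.3" bucket and "2.3.tif" in the "2" bucket.
    # Top-level codes such as "2.tif" collapse to the empty-string key,
    # which is what lets the final pass reduce level 1 down to level 0.
    buckets: dict[str, list[Path]] = {}
    for filename in files:
        parent = ".".join(filename.stem.split(".")[:-1])
        buckets.setdefault(parent, []).append(filename)
    return buckets

# bucket_by_parent_level([Path("2.3.1.tif"), Path("2.3.2.tif"), Path("2.4.tif")])
# => {"2.3": [Path("2.3.1.tif"), Path("2.3.2.tif")], "2": [Path("2.4.tif")]}
```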
From 33d74d4396d6d67b6236fdcf6335e6ab4fc8d747 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 3 Oct 2025 07:54:48 +0100 Subject: [PATCH 10/21] More Python modernisation --- prepare_layers/convert_crosswalk.py | 9 ++-- prepare_species/apply_birdlife_data.py | 14 +++--- prepare_species/extract_species_data_psql.py | 49 ++++++++++---------- threats/threat_processing.py | 20 ++++---- threats/threat_summation.py | 3 +- utils/collect_validation_data.py | 13 +++--- 6 files changed, 55 insertions(+), 53 deletions(-) diff --git a/prepare_layers/convert_crosswalk.py b/prepare_layers/convert_crosswalk.py index 37926ff..4049d6e 100644 --- a/prepare_layers/convert_crosswalk.py +++ b/prepare_layers/convert_crosswalk.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path import pandas as pd @@ -28,8 +29,8 @@ } def convert_crosswalk( - original_path: str, - output_path: str, + original_path: Path, + output_path: Path, ) -> None: original = pd.read_csv(original_path) @@ -56,14 +57,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Convert IUCN crosswalk to minimal common format.") parser.add_argument( '--original', - type=str, + type=Path, help="Original format", required=True, dest="original_path", ) parser.add_argument( '--output', - type=str, + type=Path, help='Destination minimal file', required=True, dest='output_path', diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 60c2a78..da65746 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -1,6 +1,6 @@ import argparse import math -import os +from pathlib import Path import aoh import geopandas as gpd @@ -22,8 +22,8 @@ # Occasional upper elevation def apply_birdlife_data( - geojson_directory_path: str, - overrides_path: str, + geojson_directory_path: Path, + overrides_path: Path, ) -> None: overrides = pd.read_csv(overrides_path, encoding="latin1") @@ -31,8 +31,8 @@ def apply_birdlife_data( if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): continue - path = os.path.join(geojson_directory_path, "AVES", "current", f'{row["SIS ID"]}.geojson') - if not os.path.exists(path): + path = geojson_directory_path / "AVES" / "current" / f'{row["SIS ID"]}.geojson' + if not path.exists(): continue species_info = gpd.read_file(path) @@ -55,14 +55,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") parser.add_argument( '--geojsons', - type=str, + type=Path, help='Directory where per species Geojson is stored', required=True, dest='geojson_directory_path', ) parser.add_argument( '--overrides', - type=str, + type=Path, help="CSV of overrides", required=True, dest="overrides", diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index f49087a..43b65c7 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -5,7 +5,8 @@ import os from functools import partial from multiprocessing import Pool -from typing import Any, List, Optional, Set, Tuple +from pathlib import Path +from typing import Any, Optional import aoh import geopandas as gpd @@ -180,13 +181,13 @@ def __getattr__(self, name: str) -> Any: return self.info[name] return None - def as_row(self) -> List: + def as_row(self) -> list: return [self.info[k] for k in self.REPORT_COLUMNS] def tidy_reproject_save( gdf: gpd.GeoDataFrame, report: SpeciesReport, - 
output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], ) -> None: src_crs = pyproj.CRS.from_epsg(4326) @@ -200,16 +201,16 @@ def tidy_reproject_save( elevation_seperation=ELEVATION_SPREAD, ) os.makedirs(output_directory_path, exist_ok=True) - output_path = os.path.join(output_directory_path, f"{grow.id_no}.geojson") + output_path = output_directory_path / f"{grow.id_no}.geojson" res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry") res_projected = res.to_crs(target_crs) res_projected.to_file(output_path, driver="GeoJSON") report.filename = output_path def process_systems( - systems_data: List[Tuple], + systems_data: list[tuple], report: SpeciesReport, -) -> List: +) -> list: if len(systems_data) == 0: raise ValueError("No systems found") if len(systems_data) > 1: @@ -249,9 +250,9 @@ def process_systems( ] def process_threats( - threat_data: List[Tuple[int,str,str]], + threat_data: list[tuple[int, str, str]], report: SpeciesReport, -) -> List[Tuple[int,int]]: +) -> list[tuple[int, int]]: cleaned_threats = [] for code, scope, severity in threat_data: if scope is None or scope.lower() == "unknown": @@ -267,9 +268,9 @@ def process_threats( return cleaned_threats def process_habitats( - habitats_data: List[List[str]], + habitats_data: list[list[str]], report: SpeciesReport, -) -> Set: +) -> set: if len(habitats_data) == 0: # Promote to "Unknown" habitats_data = [["18"]] @@ -295,7 +296,7 @@ def process_habitats( return habitats def process_geometries( - geometries_data: List[Tuple[int,shapely.Geometry]], + geometries_data: list[tuple[int, shapely.Geometry]], report: SpeciesReport, ) -> shapely.Geometry: if len(geometries_data) == 0: @@ -326,10 +327,10 @@ def process_geometries( def process_row( class_name: str, - output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], - presence: Tuple[int, ...], - row: Tuple, + presence: tuple[int, ...], + row: tuple, ) -> SpeciesReport: connection = psycopg2.connect(DB_CONFIG) @@ -399,7 +400,7 @@ def process_row( return report def apply_overrides( - overrides_path: str, + overrides_path: Path, results, ): overrides = pd.read_csv(overrides_path, encoding="latin1") @@ -426,16 +427,16 @@ def apply_overrides( def extract_data_per_species( class_name: str, - overrides_path: Optional[str], - excludes_path: Optional[str], - output_directory_path: str, + overrides_path: Optional[Path], + excludes_path: Optional[Path], + output_directory_path: Path, target_projection: Optional[str], ) -> None: connection = psycopg2.connect(DB_CONFIG) cursor = connection.cursor() - excludes: Tuple = tuple([]) + excludes: tuple = tuple([]) if excludes_path is not None: try: df = pd.read_csv(excludes_path) @@ -447,7 +448,7 @@ def extract_data_per_species( # For STAR-R we need historic data, but for STAR-T we just need current. 
# for era, presence in [("current", (1, 2)), ("historic", (1, 2, 4, 5))]:
     for era, presence in [("current", (1, 2))]:
-        era_output_directory_path = os.path.join(output_directory_path, era)
+        era_output_directory_path = output_directory_path / era
 
         # You can't do NOT IN on an empty list in SQL
         if excludes:
@@ -478,7 +479,7 @@
             columns=SpeciesReport.REPORT_COLUMNS
         ).sort_values('id_no')
         os.makedirs(era_output_directory_path, exist_ok=True)
-        reports_df.to_csv(os.path.join(era_output_directory_path, "report.csv"), index=False)
+        reports_df.to_csv(era_output_directory_path / "report.csv", index=False)
 
 def main() -> None:
     parser = argparse.ArgumentParser(description="Process aggregate species data to per-species-file.")
@@ -491,21 +492,21 @@ def main() -> None:
     )
     parser.add_argument(
         '--overrides',
-        type=str,
+        type=Path,
         help="CSV of overrides",
         required=False,
         dest="overrides",
     )
     parser.add_argument(
         '--excludes',
-        type=str,
+        type=Path,
         help="CSV of taxon IDs to not include",
         required=False,
         dest="excludes"
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help='Directory where per species GeoJSON is stored',
         required=True,
         dest='output_directory_path',
diff --git a/threats/threat_processing.py b/threats/threat_processing.py
index 8e6011e..0e04c73 100644
--- a/threats/threat_processing.py
+++ b/threats/threat_processing.py
@@ -2,15 +2,16 @@
 import json
 import os
 import sys
+from pathlib import Path
 
 import geopandas as gpd
 import yirgacheffe as yg
 from pyogrio.errors import DataSourceError
 
 def threat_processing_per_species(
-    species_data_path: str,
-    aoh_path: str,
-    output_directory_path: str,
+    species_data_path: Path,
+    aoh_path: Path,
+    output_directory_path: Path,
 ) -> None:
     try:
         data = gpd.read_file(species_data_path)
@@ -26,8 +27,7 @@ def threat_processing_per_species(
     threat_data = json.loads(data.threats[0])
 
     try:
-        aoh_base, _ = os.path.splitext(aoh_path)
-        aoh_data_path = aoh_base + ".json"
+        aoh_data_path = aoh_path.with_suffix(".json")
         with open(aoh_data_path, "r", encoding="UTF-8") as f:
             aoh_data = json.load(f)
             aoh_total = aoh_data["aoh_total"]
@@ -46,9 +46,9 @@ def threat_processing_per_species(
         per_threat_per_species_score = weighted_species * proportional_threat_weight
         print(per_threat_per_species_score.sum())
 
-        threat_dir_path = os.path.join(output_directory_path, str(threat_id))
+        threat_dir_path = output_directory_path / str(threat_id)
         os.makedirs(threat_dir_path, exist_ok=True)
-        output_path = os.path.join(threat_dir_path, f"{taxon_id}.tif")
+        output_path = threat_dir_path / f"{taxon_id}.tif"
         per_threat_per_species_score.to_geotiff(output_path)
 
 def main() -> None:
@@ -57,21 +57,21 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Calculate per species threat layers")
     parser.add_argument(
         '--speciesdata',
-        type=str,
+        type=Path,
         help="Single species/seasonality geojson.",
         required=True,
         dest="species_data_path"
     )
     parser.add_argument(
         '--aoh',
-        type=str,
+        type=Path,
         help="AoH raster of species.",
         required=True,
         dest="aoh_path"
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help='Directory where per species/threat layers are stored',
         required=True,
         dest='output_directory_path',
diff --git a/threats/threat_summation.py b/threats/threat_summation.py
index 194d157..5b93ede 100644
--- a/threats/threat_summation.py
+++ b/threats/threat_summation.py
@@ -5,7 +5,6 @@
 import time
 from multiprocessing import Manager, Process, Queue, cpu_count
 from pathlib import Path
-from typing import List
 
 import yirgacheffe as yg
 from yirgacheffe.layers import RasterLayer
 from osgeo import gdal
 
 gdal.SetCacheMax(1024 * 1024 * 32)
@@ -39,7 +38,7 @@ def worker(
         merged_result.to_geotiff(output_tif)
 
 def raster_sum(
-    images_list: List[Path],
+    images_list: list[Path],
     output_filename: Path,
     processes_count: int
 ) -> None:
diff --git a/utils/collect_validation_data.py b/utils/collect_validation_data.py
index 278d71d..49a3563 100644
--- a/utils/collect_validation_data.py
+++ b/utils/collect_validation_data.py
@@ -1,13 +1,14 @@
 import argparse
 import os
 import shutil
+from pathlib import Path
 
 import pandas as pd
 
 def collect_validation_data(
-    model_results_path: str,
-    data_dir: str,
-    output_dir: str,
+    model_results_path: Path,
+    data_dir: Path,
+    output_dir: Path,
 ) -> None:
     model_results = pd.read_csv(model_results_path)
     os.makedirs(output_dir, exist_ok=True)
@@ -29,21 +30,21 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Collect range/AoH data for species that failed validation")
     parser.add_argument(
         '--model_results',
-        type=str,
+        type=Path,
         help="CSV of model validation results",
         required=True,
         dest="model_results_path"
     )
     parser.add_argument(
         '--datadir',
-        type=str,
+        type=Path,
         help="directory for results",
         required=True,
         dest="data_dir",
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help="name of output directory",
         required=True,
         dest="output"

From dace36fe5f29d08379f507a3dddb9d3d7f0c2b04 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Fri, 3 Oct 2025 08:58:25 +0100
Subject: [PATCH 11/21] Unify run and slurm scripts.

---
 .github/workflows/python-package.yml |   6 +-
 scripts/run.sh                       | 107 +++++++++++++++----------
 scripts/slurm.sh                     | 115 ---------------------------
 3 files changed, 71 insertions(+), 157 deletions(-)
 delete mode 100644 scripts/slurm.sh

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 397183d..94ca3e0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Install system
       run: |
         apt-get update -qqy
-        apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev
+        apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev shellcheck
     - uses: actions/checkout@v4
       with:
         submodules: 'true'
@@ -46,3 +46,7 @@ jobs:
 
     - name: Tests
       run: python3 -m pytest ./tests
+
+    - name: Script checks
+      run: |
+        shellcheck ./scripts/run.sh
diff --git a/scripts/run.sh b/scripts/run.sh
index 2ce4863..c507bdc 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -7,7 +7,34 @@
 # https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
 # https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
 
+# Set shell script to exit on first error (-e) and to output commands being run to make
+# reviewing logs easier (-x)
 set -e
+set -x
+
+# We use two Go tools, so add go/bin to our PATH, as under SLURM they're likely
+# to be installed locally
+export PATH="${PATH}":"${HOME}"/go/bin
+if ! hash littlejohn 2>/dev/null; then
+    echo "Please ensure littlejohn is available"
+    exit 1
+fi
+if ! hash reclaimer 2>/dev/null; then
+    echo "Please ensure reclaimer is available"
+    exit 1
+fi
+
+# Detect if we're running under SLURM
+if [[ -n "${SLURM_JOB_ID}" ]]; then
+    # Slurm users will probably need to customise this
+    # shellcheck disable=SC1091
+    source "${HOME}"/venvs/star/bin/activate
+    cd "${HOME}"/dev/star
+    PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}"
+else
+    PROCESS_COUNT=$(nproc --all)
+fi
+echo "Using ${PROCESS_COUNT} threads."
if [ -z "${DATADIR}" ]; then echo "Please specify $DATADIR" @@ -19,96 +46,94 @@ if [ -z "${VIRTUAL_ENV}" ]; then exit 1 fi -export CPUS=`getconf _NPROCESSORS_ONLN` -export THREADS=$(($CPUS / 2)) -echo "Using $THREADS threads." - declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA") # Get habitat layer and prepare for use -if [ ! -d ${DATADIR}/habitat_layers ]; then - if [ ! -f ${DATADIR}/habitat/raw.tif ]; then +if [ ! -d "${DATADIR}"/habitat_layers ]; then + if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then echo "Fetching habitat map..." - reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif + reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output "${DATADIR}"/habitat/raw.tif fi echo "Processing habitat map..." - python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \ - --scale 1000.0 \ - --projection "ESRI:54009" \ - --output ${DATADIR}/tmp_habitat_layers/current - mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers + aoh-habitat-process --habitat "${DATADIR}"/habitat/raw.tif \ + --scale 1000.0 \ + --projection "ESRI:54009" \ + --output "${DATADIR}"/tmp_habitat_layers/current + mv "${DATADIR}"/tmp_habitat_layers "${DATADIR}"/habitat_layers fi -if [ ! -d ${DATADIR}/masks ]; then +if [ ! -d "${DATADIR}"/masks ]; then echo "Processing masks..." - python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \ - --output_directory ${DATADIR}/masks + python3 ./prepare_layers/make_masks.py --habitat_layers "${DATADIR}"/habitat_layers/current \ + --output_directory "${DATADIR}"/masks fi # Fetch and prepare the elevation layers -if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then - if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then +if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then + if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then echo "Fetching elevation map..." - reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif + reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then echo "Generating elevation max layer..." - gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-max-1k.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]; then echo "Generating elevation min layer..." - gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-min-1k.tif fi fi # Generate the crosswalk table -if [ ! -f ${DATADIR}/crosswalk.csv ]; then +if [ ! 
-f "${DATADIR}"/crosswalk.csv ]; then echo "Generating crosswalk table..." - python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv + python3 ./prepare_layers/convert_crosswalk.py --original "${PWD}"/data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv fi # Get species data per taxa from IUCN data for TAXA in "${TAXALIST[@]}" do - echo "Extracting species data for ${TAXA}..." - python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv + if [ ! -d "${DATADIR}"/species-info/"${TAXA}"/ ]; then + echo "Extracting species data for ${TAXA}..." + python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" --output "${DATADIR}"/species-info/"${TAXA}"/ --projection "ESRI:54009" --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv + fi done if [ -f data/BL_Species_Elevations_2023.csv ]; then echo "Applying birdlife data..." - python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv + python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv fi echo "Generating AoH task list..." -python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv +python3 ./utils/aoh_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/aohbatch.csv echo "Generating AoHs..." -littlejohn -j ${THREADS} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py +littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/aohbatch.csv "${VIRTUAL_ENV}"/bin/aoh-calc # Calculate predictors from AoHs echo "Generating species richness..." -python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/summaries/species_richness.tif +python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/summaries/species_richness.tif echo "Generating endemism..." -python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \ - --species_richness ${DATADIR}/summaries/species_richness.tif \ - --output ${DATADIR}/summaries/endemism.tif +python3 ./aoh-calculator/summaries/endemism.py --aohs_folder "${DATADIR}"/aohs/current/ \ + --species_richness "${DATADIR}"/summaries/species_richness.tif \ + --output "${DATADIR}"/summaries/endemism.tif # Aoh Validation echo "Collating validation data..." -python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/validation/aohs.csv +python3 ./aoh-calculator/validation/collate_data.py --aoh_results "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/validation/aohs.csv echo "Calculating model validation..." -python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \ - --output ${DATADIR}/validation/model_validation.csv +python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data "${DATADIR}"/validation/aohs.csv \ + --output "${DATADIR}"/validation/model_validation.csv # Threats echo "Generating threat task list..." 
-python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv
+python3 ./utils/threats_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/threatbatch.csv
 
 echo "Generating threat rasters..."
-littlejohn -j ${THREADS} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py
+littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/threatbatch.log -c "${DATADIR}"/threatbatch.csv "${VIRTUAL_ENV}"/bin/python3 -- ./threats/threat_processing.py
 
 echo "Summarising threats..."
-python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results
+python3 ./threats/threat_summation.py --threat_rasters "${DATADIR}"/threat_rasters --output "${DATADIR}"/threat_results
diff --git a/scripts/slurm.sh b/scripts/slurm.sh
deleted file mode 100644
index c0eed55..0000000
--- a/scripts/slurm.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-#
-# Assumes you've set up a Python virtual environment in the current directory.
-#
-# In addition to the Python environment, you will need the following extra command line tools:
-#
-# https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
-# https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
-
-set -e
-
-# shellcheck disable=SC1091
-source ${HOME}/venvs/life/bin/activate
-cd ${HOME}/dev/star
-export PATH=$PATH:$HOME/go/bin
-
-if [ -z "${DATADIR}" ]; then
-    echo "Please specify DATADIR"
-    exit 1
-fi
-
-if [ -z "${VIRTUAL_ENV}" ]; then
-    echo "Please run within a virtualenv"
-    exit 1
-fi
-
-declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
-
-# Get habitat layer and prepare for use
-if [ ! -d ${DATADIR}/habitat_layers ]; then
-    if [ ! -f ${DATADIR}/habitat/raw.tif ]; then
-        echo "Fetching habitat map..."
-        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif
-    fi
-
-    echo "Processing habitat map..."
-    python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \
-        --scale 1000.0 \
-        --projection "ESRI:54009" \
-        --output ${DATADIR}/tmp_habitat_layers/current
-    mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers
-fi
-
-if [ ! -d ${DATADIR}/masks ]; then
-    echo "Processing masks..."
-    python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \
-        --output_directory ${DATADIR}/masks
-fi
-
-# Fetch and prepare the elevation layers
-if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then
-    if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then
-        echo "Fetching elevation map..."
-        reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif
-    fi
-    if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then
-        echo "Generating elevation max layer..."
-        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif
-    fi
-    if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then
-        echo "Generating elevation min layer..."
-        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif
-    fi
-fi
-
-# Generate the crosswalk table
-if [ ! -f ${DATADIR}/crosswalk.csv ]; then
-    echo "Generating crosswalk table..."
-    python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv
-fi
-
-# Get species data per taxa from IUCN data
-for TAXA in "${TAXALIST[@]}"
-do
-    echo "Extracting species data for ${TAXA}..."
-    python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv
-done
-
-if [ -f data/BL_Species_Elevations_2023.csv ]; then
-    echo "Applying birdlife data..."
-    python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv
-fi
-
-echo "Generating AoH task list..."
-python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv
-
-echo "Generating AoHs..."
-littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py
-
-# Calculate predictors from AoHs
-echo "Generating species richness..."
-python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \
-    --output ${DATADIR}/summaries/species_richness.tif
-echo "Generating endemism..."
-python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \
-    --species_richness ${DATADIR}/summaries/species_richness.tif \
-    --output ${DATADIR}/summaries/endemism.tif
-
-# Aoh Validation
-echo "Collating validation data..."
-python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \
-    --output ${DATADIR}/validation/aohs.csv
-echo "Calculating model validation..."
-python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \
-    --output ${DATADIR}/validation/model_validation.csv
-
-# Threats
-echo "Generating threat task list..."
-python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv
-
-echo "Generating threat rasters..."
-littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py
-
-echo "Summarising threats..."
-python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results
From e70cc3402343903766161c75a69c757bc1e130a1 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Wed, 15 Oct 2025 11:52:44 +0000
Subject: [PATCH 12/21] Small tidying

---
 requirements.txt | 2 --
 scripts/run.sh   | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 31c1497..b1514f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,14 +8,12 @@ pymer4
 pyproj
 scikit-image
 requests
-zenodo_search
 yirgacheffe
 aoh
 
 gdal[numpy]
 
 git+https://github.com/quantifyearth/iucn_modlib
-git+https://github.com/quantifyearth/pyshark
 
 pylint
 mypy
diff --git a/scripts/run.sh b/scripts/run.sh
index c507bdc..a5dcab6 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -48,6 +48,10 @@ fi
 
 declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
 
+if [ ! -d "${DATADIR}" ]; then
-d "${DATADIR}" ]; then + mkdir "${DATADIR}" +fi + # Get habitat layer and prepare for use if [ ! -d "${DATADIR}"/habitat_layers ]; then if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then @@ -73,6 +77,7 @@ fi if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then echo "Fetching elevation map..." + mkdir -p "${DATADIR}"/elevation reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif fi if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then From 99bf082ac14f914236f0393fd04fc685a8f87d9e Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 16 Oct 2025 07:27:03 +0100 Subject: [PATCH 13/21] Full run with updated aoh --- requirements.txt | 2 +- scripts/run.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index b1514f4..56c2f4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pyproj scikit-image requests yirgacheffe -aoh +aoh[validation] gdal[numpy] diff --git a/scripts/run.sh b/scripts/run.sh index a5dcab6..8e875bb 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -93,7 +93,7 @@ fi # Generate the crosswalk table if [ ! -f "${DATADIR}"/crosswalk.csv ]; then echo "Generating crosswalk table..." - python3 ./prepare_layers/convert_crosswalk.py --original "${PWD}"/data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv + python3 ./prepare_layers/convert_crosswalk.py --original "${DATADIR}"/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv fi # Get species data per taxa from IUCN data @@ -105,9 +105,9 @@ do fi done -if [ -f data/BL_Species_Elevations_2023.csv ]; then +if [ -f "${DATADIR}"/BL_Species_Elevations_2023.csv ]; then echo "Applying birdlife data..." - python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv + python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides "${DATADIR}"/BL_Species_Elevations_2023.csv fi echo "Generating AoH task list..." @@ -118,20 +118,20 @@ littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/ao # Calculate predictors from AoHs echo "Generating species richness..." -python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder "${DATADIR}"/aohs/current/ \ - --output "${DATADIR}"/summaries/species_richness.tif +aoh-species-richness --aohs_folder "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/summaries/species_richness.tif echo "Generating endemism..." -python3 ./aoh-calculator/summaries/endemism.py --aohs_folder "${DATADIR}"/aohs/current/ \ - --species_richness "${DATADIR}"/summaries/species_richness.tif \ - --output "${DATADIR}"/summaries/endemism.tif +aoh-endemism --aohs_folder "${DATADIR}"/aohs/current/ \ + --species_richness "${DATADIR}"/summaries/species_richness.tif \ + --output "${DATADIR}"/summaries/endemism.tif # Aoh Validation echo "Collating validation data..." -python3 ./aoh-calculator/validation/collate_data.py --aoh_results "${DATADIR}"/aohs/current/ \ - --output "${DATADIR}"/validation/aohs.csv +aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/validation/aohs.csv echo "Calculating model validation..." 
-python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
-    --output "${DATADIR}"/validation/model_validation.csv
+aoh-validate-prevalence --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
+    --output "${DATADIR}"/validation/model_validation.csv
 
 # Threats
 echo "Generating threat task list..."
From 6568275536d9bd30cd6febd8a935f6c3314a1d75 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:41:47 +0100
Subject: [PATCH 14/21] Updated readme for required inputs

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 93895b4..948ce0e 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,14 @@ $ cd star
 $ git submodule update --init --recursive
 ```
 
+## Additional inputs
+
+There are some additional inputs required to run the pipeline, which should be plated in the directory you use to store the pipeline results.
+
+* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
+* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
+* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
+
 ## Running the pipeline
 
 The easiest way to get started will be to run `scripts/run.sh` under a linux environment.
From a619824a37f9a34af6df5f95fc96045afe509c29 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:43:21 +0100
Subject: [PATCH 15/21] script formatting

---
 scripts/run.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/run.sh b/scripts/run.sh
index 8e875bb..a1d153f 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -56,7 +56,9 @@ fi
 if [ ! -d "${DATADIR}"/habitat_layers ]; then
     if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then
         echo "Fetching habitat map..."
-        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output "${DATADIR}"/habitat/raw.tif
+        reclaimer zenodo --zenodo_id 3939050 \
+            --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif \
+            --output "${DATADIR}"/habitat/raw.tif
     fi
 
     echo "Processing habitat map..."
From 2186abd3af14778e2062a1fa7e66d590ced320cc Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:54:15 +0100
Subject: [PATCH 16/21] More readme instructions

---
 README.md        | 30 +++++++++++++++++++++++++-----
 requirements.txt |  2 --
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 948ce0e..497b609 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,34 @@ See [method.md](method.md) for a description of the methodology, or `scripts/run
 
 # Running the pipeline
 
-## Checking out the code
+## Requirements
 
-This repository uses submodules, so once you have cloned it, you need to fetch the submodules:
+The easiest way to run the pipeline is using the included Dockerfile to build a Docker container which will have all the dependencies installed in it.
+
+If not, you will need:
+
+* Python3 >= 3.10
+* GDAL
+* R (required for validation)
+
+If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
+
+With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version.
+
+```shell
+$ python3 -m venv ./venv
+$ . ./venv/bin/activate
+(venv) $ gdalinfo --version
+GDAL 3.11.3 "Eganville", released 2025/07/12
+(venv) $ pip install gdal[numpy]==3.11.3
+...
+(venv) $ pip install -r requirements.txt
+```
+
+You will also need to install the R stats packages required for the validation stage:
 
 ```shell
-$ git clone https://github.com/quantifyearth/star.git
-$ cd star
-$ git submodule update --init --recursive
+$ R -e "install.packages(c('lme4', 'lmerTest'), repos='https://cran.rstudio.com/')"
 ```
 
 ## Additional inputs
diff --git a/requirements.txt b/requirements.txt
index 56c2f4b..6fad873 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-alive-progress
 numpy
 geopandas
 postgis
@@ -7,7 +6,6 @@ psutil
 pymer4
 pyproj
 scikit-image
-requests
 yirgacheffe
 aoh[validation]
 
 gdal[numpy]
From 0f32da808522aadb535fdd2e8b73a62a3d480a55 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:45:31 +0100
Subject: [PATCH 17/21] Update to GDAL 3.11.4

---
 .github/workflows/python-package.yml | 4 ++--
 Dockerfile                           | 6 ++++--
 README.md                            | 2 ++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 94ca3e0..5724e1c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.3
+    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
     strategy:
       fail-fast: false
       matrix:
@@ -35,7 +35,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         python -m pip install gdal[numpy]==3.11.3
+         python -m pip install gdal[numpy]==3.11.4
          python -m pip install -r requirements.txt
 
      - name: Lint with pylint
diff --git a/Dockerfile b/Dockerfile
index 1217663..5adc593 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,14 @@ WORKDIR /go/littlejohn
 RUN go mod tidy
 RUN go build
 
-FROM ghcr.io/osgeo/gdal:ubuntu-small-3.10.0
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
 
 RUN apt-get update -qqy && \
     apt-get install -qy \
         git \
         cmake \
         python3-pip \
+        shellcheck \
         r-base \
         libpq-dev \
         libtirpc-dev \
@@ -27,7 +28,7 @@ COPY --from=reclaimerbuild /go/reclaimer/reclaimer /bin/reclaimer
 COPY --from=littlejohnbuild /go/littlejohn/littlejohn /bin/littlejohn
 
 RUN rm /usr/lib/python3.*/EXTERNALLY-MANAGED
-RUN pip install gdal[numpy]==3.10.0
+RUN pip install gdal[numpy]==3.11.4
 COPY requirements.txt /tmp/
 RUN pip install -r /tmp/requirements.txt
 
@@ -54,3 +55,4 @@ ENV PYTHONPATH=/root/star
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
 RUN python3 -m mypy prepare_layers prepare_speices utils tests
+RUN shellcheck ./scripts/run.sh
diff --git a/README.md b/README.md
index 497b609..87c8fd8 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ If not, you will need:
 * Python3 >= 3.10
 * GDAL
 * R (required for validation)
+* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a Go tool for fetching data from Zenodo
+* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a Go tool for running scripts in parallel
 
 If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
From 380f34fffe394322dfa911c1c9886e3d05e61d38 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:46:17 +0100
Subject: [PATCH 18/21] README update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 87c8fd8..f41a1d7 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
 
-With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version.
+With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version. For example, on my machine I did the following:
 
 ```shell
 $ python3 -m venv ./venv
From ad47f640911fbbed39dbd9302e8c10aa42387024 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:48:53 +0100
Subject: [PATCH 19/21] Fix typo in dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 5adc593..913f289 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -54,5 +54,5 @@ ENV PYTHONPATH=/root/star
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
-RUN python3 -m mypy prepare_layers prepare_speices utils tests
+RUN python3 -m mypy prepare_layers prepare_species utils tests
 RUN shellcheck ./scripts/run.sh
From 502d23b74694ac6f087368f9862828445bc0ac56 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 09:05:49 +0100
Subject: [PATCH 20/21] Address review comments

---
 .gitmodules      |  0
 README.md        | 90 +++++++++++++++++++++++++++++-------------------
 requirements.txt |  7 ++--
 scripts/run.sh   |  3 +-
 4 files changed, 58 insertions(+), 42 deletions(-)
 delete mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29..0000000
diff --git a/README.md b/README.md
index f41a1d7..1749c29 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,53 @@ An implementation of the threat based [STAR biodiversity metric by Muir et al](h
 
 See [method.md](method.md) for a description of the methodology, or `scripts/run.sh` for how to execute the pipeline.
 
-# Running the pipeline
+## Checking out the code
 
-## Requirements
+The code is available on GitHub, and can be checked out from there:
 
-The easiest way to run the pipeline is using the included Dockerfile to build a Docker container which will have all the dependencies installed in it.
+
+```shell
+$ git clone https://github.com/quantifyearth/STAR.git
+...
+$ cd STAR
+```
+
+## Additional inputs
+
+There are some additional inputs required to run the pipeline, which should be placed in the directory you use to store the pipeline results.
+
+* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
+* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
+* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
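+
+As a purely illustrative sketch (assuming the file names listed above and a results directory of /some/local/dir - your paths may differ), the directory might look something like this before a first run:
+
+```shell
+$ ls /some/local/dir
+BL_Species_Elevations_2023.csv
+crosswalk_bin_T.csv
+SpeciesList_generalisedRangePolygons.csv
+```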
+
+The script also assumes you have a Postgres database with the IUCN Redlist data in it.
+
+## Running the pipeline
+
+There are two ways to run the pipeline. The easiest is to use Docker if you have it available, as it will manage all the dependencies for you. Alternatively, you can check the code out and run it locally, though that requires a little more effort.
 
-If not, you will need:
+### Running with Docker
+
+There is a Dockerfile included, based on the GDAL container image, which is set up to install everything ready to use. You can build that using:
+
+```shell
+$ docker buildx build -t star .
+```
+
+You can then invoke the run script using this. You should map an external folder into the container as a place to store the intermediate data and final results, and you should provide details about the Postgres instance with the IUCN Redlist:
+
+```shell
+$ docker run --rm -v /some/local/dir:/data \
+    -e DB_HOST=localhost \
+    -e DB_NAME=iucnredlist \
+    -e DB_PASSWORD=supersecretpassword \
+    -e DB_USER=postgres \
+    star ./scripts/run.sh
+```
+
+### Running without Docker
+
+If you prefer not to use Docker, you will need:
 
 * Python3 >= 3.10
 * GDAL
 * R (required for validation)
@@ -38,42 +78,20 @@ You will also need to install the R stats packages required for the validation s
 $ R -e "install.packages(c('lme4', 'lmerTest'), repos='https://cran.rstudio.com/')"
 ```
 
-## Additional inputs
-
-There are some additional inputs required to run the pipeline, which should be plated in the directory you use to store the pipeline results.
-
-* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
-* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
-* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
-
-## Running the pipeline
-
-The easiest way to get started will be to run `scripts/run.sh` under a linux environment.
-
-### Running on Ubuntu
-
-The following extra utilities will need to be installed:
-
-* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a utility for downloading data from various primary sources.
-* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a utility to run jobs in parallel driven by a CSV file.
+Before running the pipeline you will need to set several environment variables to tell the script where to store data and where to find the database with the IUCN Redlist. You can set these manually, or we recommend using a tool like [direnv](https://direnv.net).
 
-### Running in Docker
-
-There is included a docker file, which is based on the GDAL container image, which is set up to install everything ready to use. You can build that using:
-
-```
-$ docker buildx build -t star .
+```shell
+export DATADIR=[PATH WHERE YOU WANT THE RESULTS]
+export DB_HOST=localhost
+export DB_NAME=iucnredlist
+export DB_PASSWORD=supersecretpassword
+export DB_USER=postgres
 ```
 
-You can then invoke the run script using this. You should map an external folder into the container as a place to store the intermediary data and final results, and you should provide details about the Postgres instance with the IUCN redlist:
 
-```
-$ docker run --rm -v /some/local/dir:/data \
-    -e DB_HOST=localhost \
-    -e DB_NAME=iucnredlist \
-    -e DB_PASSWORD=supersecretpassword \
-    -e DB_USER=postgres \
-    star ./scripts/run.sh
+Once you have all that you can then run the pipeline:
 
-```
+```shell
+(venv) $ ./scripts/run.sh
 ```
 
 # Credits
diff --git a/requirements.txt b/requirements.txt
index 6fad873..720c71a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,13 +6,12 @@ psutil
 pymer4
 pyproj
 scikit-image
-yirgacheffe
-aoh[validation]
+yirgacheffe>=1.9
+aoh[validation]>=1.0
+
+# GDAL should be installed manually to match the version of the library installed on your machine
 gdal[numpy]
 
-git+https://github.com/quantifyearth/iucn_modlib
-
 pylint
 mypy
diff --git a/scripts/run.sh b/scripts/run.sh
index a1d153f..f3c193f 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -32,7 +32,7 @@ if [[ -n "${SLURM_JOB_ID}" ]]; then
     cd "${HOME}"/dev/star
     PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}"
 else
-    PROCESS_COUNT=$(nproc --all)
+    PROCESS_COUNT=$(getconf _NPROCESSORS_ONLN)
 fi
 echo "Using ${PROCESS_COUNT} threads."
 
@@ -79,7 +79,6 @@ fi
 if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then
     if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then
         echo "Fetching elevation map..."
-        mkdir -p "${DATADIR}"/elevation
         reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif
     fi
     if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then
From 8346a594a5f260264fd856285362654f9271f467 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 09:14:28 +0100
Subject: [PATCH 21/21] Add Lumbierres crosswalk

---
 README.md                | 15 ++++++++++++++-
 data/crosswalk_bin_T.csv | 18 ++++++++++++++++++
 scripts/run.sh           |  2 +-
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 data/crosswalk_bin_T.csv

diff --git a/README.md b/README.md
index 1749c29..bc552b6 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@
 There are some additional inputs required to run the pipeline, which should be placed in the directory you use to store the pipeline results.
 
-* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
 * SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
 * BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
@@ -97,3 +96,17 @@ Once you have all that you can then run the pipeline:
 # Credits
 
 The author of this package is greatly indebted to both [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) from the University of Newcastle and [Simon Tarr](https://www.linkedin.com/in/simon-tarr-22069b209/) of the IUCN for their guidance and review.
+
+## Data Attribution
+
+The crosswalk table `data/crosswalk_bin_T.csv` was created by [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) and is derived from:
+
+```
+Lumbierres, M., Dahal, P.R., Di Marco, M., Butchart, S.H.M., Donald, P.F.,
+& Rondinini, C. (2022). Translating habitat class to land cover to map area
+of habitat of terrestrial vertebrates. Conservation Biology, 36, e13851.
+https://doi.org/10.1111/cobi.13851
+```
+
+The paper is licensed under CC BY-NC. It is used in this STAR implementation to crosswalk between the IUCN Habitat classes in the Redlist and the land classes in the Copernicus data layers.
+
diff --git a/data/crosswalk_bin_T.csv b/data/crosswalk_bin_T.csv
new file mode 100644
index 0000000..f068cf8
--- /dev/null
+++ b/data/crosswalk_bin_T.csv
@@ -0,0 +1,18 @@
+CGLS100_name,CGLS100_value,Label,H_1,H_2,H_3,H_4,H_5,H_6,H_7,H_8,H_14.1,H_14.2,H_14.3,H_14.6,H_14.4,H_14.5,H_15
+CLS_20_shrubs,20,shrubs,0,1,1,0,0,0,U,1,0,0,0,0,0,0,0
+CLS_30_Herbaceous_vegetation,30,Herbaceous_vegetation,0,0,0,1,0,0,U,0,0,0,0,0,0,0,0
+CLS_40_CultivatedandManaged_VegetationAgriculture,40,CultivatedandManaged_VegetationAgriculture,0,0,0,1,1,0,U,0,1,1,0,0,0,0,0
+CLS_50_Urban_builtup,50,Urban_builtup,0,0,0,0,0,0,U,0,0,0,0,0,1,1,0
+CLS_60_bare_sparsevegetation,60,bare_sparsevegetation,0,0,1,0,0,1,U,1,0,0,0,0,0,0,0
+CLS_80_permanent_water,80,permanent_water,0,0,0,0,1,0,U,0,0,0,0,0,0,0,0
+CLS_90_Herbaceous_wetland,90,Herbaceous_wetland,0,0,0,0,1,0,U,0,0,0,0,0,0,0,1
+CLS_111_Closedforest_evergreen_needle,111,Closedforest_evergreen_needle,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_112_Closedforest_evergreen_broad,112,Closedforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_114_Closedforest_deciduous_broad,114,Closedforest_deciduous_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_115_Closedforest_mixed,115,Closedforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_116_Closedforest_unknown,116,Closedforest_unknown,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_121_Openforest_evergreen_needle,121,Openforest_evergreen_needle,1,0,0,0,0,1,U,0,0,0,0,0,0,0,0
+CLS_122_Openforest_evergreen_broad,122,Openforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_124_Openforest_deciduous_broad,124,Openforest_deciduous_broad,0,1,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_125_Openforest_mixed,125,Openforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_126_Openforest_unknown,126,Openforest_unknown,0,0,0,0,0,0,U,0,0,0,0,0,0,0,0
diff --git a/scripts/run.sh b/scripts/run.sh
index f3c193f..d88e05b 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -94,7 +94,7 @@ fi
 # Generate the crosswalk table
 if [ ! -f "${DATADIR}"/crosswalk.csv ]; then
     echo "Generating crosswalk table..."
-    python3 ./prepare_layers/convert_crosswalk.py --original "${DATADIR}"/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv
+    python3 ./prepare_layers/convert_crosswalk.py --original ./data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv
 fi
 
 # Get species data per taxa from IUCN data