From 83e4fbf70ad041ab3fc8f9d7c3f6f5120fef576e Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 28 Aug 2025 14:39:04 +0000 Subject: [PATCH 01/21] Updating STAR to use slurm and newer yirgacheffe --- prepare_layers/make_masks.py | 47 ++++---- prepare_species/extract_species_data_psql.py | 11 +- requirements.txt | 2 +- scripts/slurm.sh | 115 +++++++++++++++++++ threats/threat_processing.py | 7 +- threats/threat_summation.py | 7 +- 6 files changed, 148 insertions(+), 41 deletions(-) create mode 100644 scripts/slurm.sh diff --git a/prepare_layers/make_masks.py b/prepare_layers/make_masks.py index f5d9756..a4b5ce1 100644 --- a/prepare_layers/make_masks.py +++ b/prepare_layers/make_masks.py @@ -1,62 +1,57 @@ import argparse import os import sys -from glob import glob +from pathlib import Path from typing import Set -import numpy as np -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg +import yirgacheffe.operators as yo OPEN_SEA_LCC = "lcc_200.tif" NO_DATA_LCC = "lcc_0.tif" def prepare_mask( - layers: Set[str], - output_path: str, + layers: Set[Path], + output_path: Path, at_least: bool = True, ) -> None: assert layers - rasters = [RasterLayer.layer_from_file(x) for x in layers] - - intersection = RasterLayer.find_intersection(rasters) - for r in rasters: - r.set_window_for_intersection(intersection) + rasters = [yg.read_raster(x) for x in layers] calc = rasters[0] for r in rasters[1:]: calc = calc + r if at_least: - calc = calc.numpy_apply(lambda a: np.where(a >= 0.5, 1.0, 0.0)) + calc = yo.where(calc >= 0.5, 1.0, 0.0) else: - calc = calc.numpy_apply(lambda a: np.where(a > 0.5, 1.0, 0.0)) + calc = yo.where(calc > 0.5, 1.0, 0.0)) - with RasterLayer.empty_raster_layer_like(rasters[0], filename=output_path) as result: - calc.parallel_save(result) + calc.to_geotiff(output_path, parallelism=128) def prepare_masks( - habitat_layers_path: str, - output_directory_path: str, + habitat_layers_path: Path, + output_directory_path: Path, ) -> None: os.makedirs(output_directory_path, exist_ok=True) - layer_files = set(glob("lcc_*.tif", root_dir=habitat_layers_path)) + layer_files = set(habitat_layers_path.glob("lcc_*.tif")) if not layer_files: sys.exit(f"Found no habitat layers in {habitat_layers_path}") - marine_layers = layer_files & set([OPEN_SEA_LCC]) - terrerstrial_layers = layer_files - set([OPEN_SEA_LCC, NO_DATA_LCC]) + marine_layers = {x for x in layer_files if x.name == OPEN_SEA_LCC} + terrerstrial_layers = {x for x in layer_files if x.name not in [OPEN_SEA_LCC, NO_DATA_LCC]} assert len(marine_layers) == 1 - assert len(terrerstrial_layers) == len(layer_files) - 2 + assert len(terrerstrial_layers) < len(layer_files) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in marine_layers}, - os.path.join(output_directory_path, "marine_mask.tif"), + marine_layers, + output_directory_path / "marine_mask.tif", ) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in terrerstrial_layers}, - os.path.join(output_directory_path, "terrestrial_mask.tif"), + terrerstrial_layers, + output_directory_path / "terrestrial_mask.tif", at_least=True, ) @@ -66,14 +61,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generate terrestrial and marine masks.") parser.add_argument( '--habitat_layers', - type=str, + type=Path, help="directory with split and scaled habitat layers", required=True, dest="habitat_layers" ) parser.add_argument( '--output_directory', - type=str, + type=Path, help="Folder for output mask layers", required=True, dest="output_directory" diff 
--git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py
index 0ceda78..5f61eb8 100644
--- a/prepare_species/extract_species_data_psql.py
+++ b/prepare_species/extract_species_data_psql.py
@@ -23,7 +23,7 @@
 logger.setLevel(logging.DEBUG)
 
 # To match the FABDEM elevation map we use
-# different range min/max/seperation
+# different range min/max/separation
 ELEVATION_MAX = 8580
 ELEVATION_MIN = -427
 ELEVATION_SPREAD = 12
@@ -31,6 +31,7 @@
 COLUMNS = [
     "id_no",
     "assessment_id",
+    "assessment_year",
     "season",
     "systems",
     "elevation_lower",
@@ -61,6 +62,7 @@
 SELECT
     assessments.sis_taxon_id as id_no,
     assessments.id as assessment_id,
+    DATE_PART('year', assessments.assessment_date) as assessment_year,
     assessments.possibly_extinct,
     assessments.possibly_extinct_in_the_wild,
     (assessment_supplementary_infos.supplementary_fields->>'ElevationLower.limit')::numeric AS elevation_lower,
@@ -336,7 +338,7 @@ def process_row(
     register(connection)
     cursor = connection.cursor()
 
-    id_no, assessment_id, possibly_extinct, possibly_extinct_in_the_wild, \
+    id_no, assessment_id, assessment_year, possibly_extinct, possibly_extinct_in_the_wild, \
         elevation_lower, elevation_upper, scientific_name, family_name, category = row
 
     report = SpeciesReport(id_no, assessment_id, scientific_name)
@@ -378,6 +380,7 @@ def process_row(
         [[
             id_no,
             assessment_id,
+            assessment_year,
             "all",
             systems,
             int(elevation_lower) if elevation_lower is not None else None,
@@ -471,10 +474,6 @@ def extract_data_per_species(
             partial(process_row, class_name, era_output_directory_path, target_projection, presence),
             results
         )
-        # reports = [
-        #     process_row(class_name, era_output_directory_path, target_projection, presence, x)
-        #     for x in results[:10]
-        # ]
 
     reports_df = pd.DataFrame(
         [x.as_row() for x in reports],
diff --git a/requirements.txt b/requirements.txt
index 7c9a8ef..2a7a3dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,12 +9,12 @@ pyproj
 scikit-image
 requests
 zenodo_search
+yirgacheffe
 
 gdal[numpy]
 
 git+https://github.com/quantifyearth/iucn_modlib
 git+https://github.com/quantifyearth/pyshark
-git+https://github.com/quantifyearth/yirgacheffe@4a2cab77f4a64e3f09497ee7098dc9ba499cda90
 
 pylint
 mypy
diff --git a/scripts/slurm.sh b/scripts/slurm.sh
new file mode 100644
index 0000000..c0eed55
--- /dev/null
+++ b/scripts/slurm.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# Assumes you've set up a Python virtual environment in the current directory.
+#
+# In addition to the Python environment, you will need the following extra command line tools:
+#
+# https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
+# https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
+
+set -e
+
+# shellcheck disable=SC1091
+source ${HOME}/venvs/life/bin/activate
+cd ${HOME}/dev/star
+export PATH=$PATH:$HOME/go/bin
+
+if [ -z "${DATADIR}" ]; then
+    echo "Please specify DATADIR"
+    exit 1
+fi
+
+if [ -z "${VIRTUAL_ENV}" ]; then
+    echo "Please run within a virtualenv"
+    exit 1
+fi
+
+declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
+
+# Get habitat layer and prepare for use
+if [ ! -d ${DATADIR}/habitat_layers ]; then
+    if [ ! -f ${DATADIR}/habitat/raw.tif ]; then
+        echo "Fetching habitat map..."
+        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif
+    fi
+
+    echo "Processing habitat map..."
+ python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \ + --scale 1000.0 \ + --projection "ESRI:54009" \ + --output ${DATADIR}/tmp_habitat_layers/current + mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers +fi + +if [ ! -d ${DATADIR}/masks ]; then + echo "Processing masks..." + python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \ + --output_directory ${DATADIR}/masks +fi + +# Fetch and prepare the elevation layers +if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then + if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then + echo "Fetching elevation map..." + reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif + fi + if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then + echo "Generating elevation max layer..." + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif + fi + if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then + echo "Generating elevation min layer..." + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif + fi +fi + +# Generate the crosswalk table +if [ ! -f ${DATADIR}/crosswalk.csv ]; then + echo "Generating crosswalk table..." + python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv +fi + +# Get species data per taxa from IUCN data +for TAXA in "${TAXALIST[@]}" +do + echo "Extracting species data for ${TAXA}..." + python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv +done + +if [ -f data/BL_Species_Elevations_2023.csv ]; then + echo "Applying birdlife data..." + python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv +fi + +echo "Generating AoH task list..." +python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv + +echo "Generating AoHs..." +littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py + +# Calculate predictors from AoHs +echo "Generating species richness..." +python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \ + --output ${DATADIR}/summaries/species_richness.tif +echo "Generating endemism..." +python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \ + --species_richness ${DATADIR}/summaries/species_richness.tif \ + --output ${DATADIR}/summaries/endemism.tif + +# Aoh Validation +echo "Collating validation data..." +python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \ + --output ${DATADIR}/validation/aohs.csv +echo "Calculating model validation..." +python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \ + --output ${DATADIR}/validation/model_validation.csv + +# Threats +echo "Generating threat task list..." 
+python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv + +echo "Generating threat rasters..." +littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py + +echo "Summarising threats..." +python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results diff --git a/threats/threat_processing.py b/threats/threat_processing.py index 2a0adca..8e6011e 100644 --- a/threats/threat_processing.py +++ b/threats/threat_processing.py @@ -4,8 +4,8 @@ import sys import geopandas as gpd +import yirgacheffe as yg from pyogrio.errors import DataSourceError -from yirgacheffe.layers import RasterLayer def threat_processing_per_species( species_data_path: str, @@ -17,7 +17,7 @@ def threat_processing_per_species( except DataSourceError: sys.exit(f"Failed to read {species_data_path}") - with RasterLayer.layer_from_file(aoh_path) as aoh: + with yg.read_raster(aoh_path) as aoh: os.makedirs(output_directory_path, exist_ok=True) @@ -49,8 +49,7 @@ def threat_processing_per_species( threat_dir_path = os.path.join(output_directory_path, str(threat_id)) os.makedirs(threat_dir_path, exist_ok=True) output_path = os.path.join(threat_dir_path, f"{taxon_id}.tif") - with RasterLayer.empty_raster_layer_like(aoh, filename=output_path) as result: - per_threat_per_species_score.save(result) + per_threat_per_species_score.to_geotiff(output_path) def main() -> None: os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0" diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 54bf6a5..8905479 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import List -from yirgacheffe.layers import RasterLayer # type: ignore +import yirgacheffe as yg from osgeo import gdal gdal.SetCacheMax(1024 * 1024 * 32) @@ -26,7 +26,7 @@ def worker( if path is None: break - with RasterLayer.layer_from_file(path) as partial_raster: + with yg.read_raster(path) as partial_raster: if merged_result is None: merged_result = RasterLayer.empty_raster_layer_like(partial_raster) cleaned_raster = partial_raster.nan_to_num() @@ -38,8 +38,7 @@ def worker( merged_result = temp if merged_result: - final = RasterLayer.empty_raster_layer_like(merged_result, filename=output_tif) - merged_result.save(final) + merged_result.to_geotiff(output_tif) def raster_sum( images_list: List[Path], From 7d3597d467fb14696deca31ebb507e2493a54c12 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 29 Aug 2025 10:41:39 +0100 Subject: [PATCH 02/21] Add birdlife data script --- prepare_species/apply_birdlife_data.py | 81 ++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 prepare_species/apply_birdlife_data.py diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py new file mode 100644 index 0000000..9b46216 --- /dev/null +++ b/prepare_species/apply_birdlife_data.py @@ -0,0 +1,81 @@ +import argparse +import importlib +import math +import os + +import geopandas as gpd +import pandas as pd + +aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") + + +# Columns from current BirdLife data overrides: +# SIS ID +# Assessment ID +# WBDB ID +# Sequence +# Scientific name +# Common name +# RL Category +# PE +# PEW +# Min altitude (m) +# Max altitude (m) +# Occasional lower elevation +# Occasional upper elevation + 
+def apply_birdlife_data( + geojson_directory_path: str, + overrides_path: str, +) -> None: + overrides = pd.read_csv(overrides_path, encoding="latin1") + + for _, row in overrides.iterrows(): + if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): + continue + + path = os.path.join(geojson_directory_path, "AVES", "current", f"{row["SIS ID"]}.geojson") + if not os.path.exists(path): + continue + + species_info = gpd.read_file(path) + data = species_info.loc[0].copy() + + if not math.isnan(row["Occasional lower elevation"]): + data.elevation_lower = float(row["Occasional lower elevation"]) + else: + data.elevation_lower = float(data.elevation_lower) + if not math.isnan(row["Occasional upper elevation"]): + data.elevation_upper = float(row["Occasional upper elevation"]) + else: + data.elevation_upper = float(data.elevation_upper) + data = aoh_cleaning.tidy_data(data) + + res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry") + res.to_file(path, driver="GeoJSON") + +def main() -> None: + parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") + parser.add_argument( + '--geojsons', + type=str, + help='Directory where per species Geojson is stored', + required=True, + dest='geojson_directory_path', + ) + parser.add_argument( + '--overrides', + type=str, + help="CSV of overrides", + required=True, + dest="overrides", + ) + args = parser.parse_args() + + apply_birdlife_data( + args.geojson_directory_path, + args.overrides + ) + +if __name__ == "__main__": + main() From ceed2afacade9c693ef30af1fb4506987bd0e332 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 29 Aug 2025 10:42:23 +0100 Subject: [PATCH 03/21] Force year to int --- prepare_species/extract_species_data_psql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 5f61eb8..29d2872 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -380,7 +380,7 @@ def process_row( [[ id_no, assessment_id, - assessment_year, + int(assessment_year), "all", systems, int(elevation_lower) if elevation_lower is not None else None, From 41c99a0ed5047bfee468a38916ae1cc4fb080946 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 15:56:29 +0100 Subject: [PATCH 04/21] Update AOH from sub module to pip --- .gitmodules | 3 --- aoh-calculator | 1 - prepare_species/apply_birdlife_data.py | 6 ++---- prepare_species/extract_species_data_psql.py | 5 ++--- requirements.txt | 1 + 5 files changed, 5 insertions(+), 11 deletions(-) delete mode 160000 aoh-calculator diff --git a/.gitmodules b/.gitmodules index 30fd8a6..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "aoh-calculator"] - path = aoh-calculator - url = git@github.com:quantifyearth/aoh-calculator.git diff --git a/aoh-calculator b/aoh-calculator deleted file mode 160000 index c24def9..0000000 --- a/aoh-calculator +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c24def960799f170a9812af31d4c0e2dc5940dbf diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 9b46216..f712d24 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -3,12 +3,10 @@ import math import os +import aoh import geopandas as gpd import pandas as pd -aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") 
- - # Columns from current BirdLife data overrides: # SIS ID # Assessment ID @@ -49,7 +47,7 @@ def apply_birdlife_data( data.elevation_upper = float(row["Occasional upper elevation"]) else: data.elevation_upper = float(data.elevation_upper) - data = aoh_cleaning.tidy_data(data) + data = aoh.tidy_data(data) res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry") res.to_file(path, driver="GeoJSON") diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 29d2872..9671ba3 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -8,7 +8,7 @@ from multiprocessing import Pool from typing import Any, List, Optional, Set, Tuple -# import pyshark # pylint: disable=W0611 +import aoh import geopandas as gpd import pandas as pd import pyproj @@ -16,7 +16,6 @@ import shapely from postgis.psycopg import register -aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") logger = logging.getLogger(__name__) logging.basicConfig() @@ -195,7 +194,7 @@ def tidy_reproject_save( target_crs = pyproj.CRS.from_string(target_projection) if target_projection else src_crs graw = gdf.loc[0].copy() - grow = aoh_cleaning.tidy_data( + grow = aoh.tidy_data( graw, elevation_max=ELEVATION_MAX, elevation_min=ELEVATION_MIN, diff --git a/requirements.txt b/requirements.txt index 2a7a3dd..31c1497 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ scikit-image requests zenodo_search yirgacheffe +aoh gdal[numpy] From 7af51e229ee569f82ab43a9807680691e252f8fd Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:00:07 +0100 Subject: [PATCH 05/21] Fixes spotted by linter --- prepare_layers/make_masks.py | 2 +- prepare_species/apply_birdlife_data.py | 2 +- threats/threat_summation.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/prepare_layers/make_masks.py b/prepare_layers/make_masks.py index a4b5ce1..2557280 100644 --- a/prepare_layers/make_masks.py +++ b/prepare_layers/make_masks.py @@ -24,7 +24,7 @@ def prepare_mask( if at_least: calc = yo.where(calc >= 0.5, 1.0, 0.0) else: - calc = yo.where(calc > 0.5, 1.0, 0.0)) + calc = yo.where(calc > 0.5, 1.0, 0.0) calc.to_geotiff(output_path, parallelism=128) diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index f712d24..0380477 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -32,7 +32,7 @@ def apply_birdlife_data( if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): continue - path = os.path.join(geojson_directory_path, "AVES", "current", f"{row["SIS ID"]}.geojson") + path = os.path.join(geojson_directory_path, "AVES", "current", f'{row["SIS ID"]}.geojson') if not os.path.exists(path): continue diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 8905479..5f1ffdd 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -8,6 +8,7 @@ from typing import List import yirgacheffe as yg +from yirgacheffe.layers import RasterLayer from osgeo import gdal gdal.SetCacheMax(1024 * 1024 * 32) From 9f9f686db20573916447ce90b8aea410d4134f7a Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:00:18 +0100 Subject: [PATCH 06/21] Update method --- method.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/method.md b/method.md index f9d3fce..06daf8d 
100644
--- a/method.md
+++ b/method.md
@@ -120,8 +120,12 @@ python3 ./prepare_layers/make_masks.py --habitat_layers /data/habitat_layers/cur
 To assist with provenance, we download the data from the Zenodo ID.
 
 ```shark-run:reclaimer
-curl -o FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip
-...
+curl -o /data/FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip
+```
+
+```shark-run:gdalonly
+python3 tbd.py --input /data/FABDEM.zip \
+               --output /data/elevation.tif
 ```
 
 Similarly to the habitat map we need to resample to 1km, however rather than picking the mean elevation, we select both the min and max elevation for each pixel, and then check whether the species is in that range when we calculate AoH.
@@ -214,4 +218,18 @@ python3 ./aoh-calculator/validation/validate_map_prevelence.py --collated_aoh_da
 
 ```shark-publish
 /data/validation/model_validation.csv
-```
\ No newline at end of file
+```
+
+## Threats
+
+```shark-run:aohbuilder
+python3 ./threats/threat_processing.py --speciesdata /data/species-info/* \
+                                       --aoh /data/aohs/ \
+                                       --output /data/threat_rasters
+
+python3 ./threats/threat_summation.py --threat_rasters /data/threat_rasters --output /data/threat_results
+```
+
+```shark-publish
+/data/threat_results
+```

From d031255c7a3c02ec278b36796b0c67acb7699691 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 2 Oct 2025 16:15:49 +0100
Subject: [PATCH 07/21] Enforce mypy checks

---
 .github/workflows/python-package.yml         | 17 +++++++++++------
 .mypy.ini                                    |  4 ++++
 Dockerfile                                   |  1 +
 prepare_species/extract_species_data_psql.py | 12 ++++++------
 utils/aoh_generator.py                       | 15 +++++++--------
 utils/threats_generator.py                   | 17 ++++++++---------
 6 files changed, 37 insertions(+), 29 deletions(-)
 create mode 100644 .mypy.ini

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8fd17a0..397183d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: ghcr.io/osgeo/gdal:ubuntu-small-3.10.3
+    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.3
     strategy:
       fail-fast: false
       matrix:
@@ -26,18 +26,23 @@ jobs:
     - uses: actions/checkout@v4
       with:
         submodules: 'true'
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install gdal[numpy]==3.10.3
+        python -m pip install gdal[numpy]==3.11.3
         python -m pip install -r requirements.txt
+
     - name: Lint with pylint
-      run: |
-        python3 -m pylint utils prepare_layers prepare_species threats
+      run: python3 -m pylint utils prepare_layers prepare_species threats
+
+    - name: Type checking with mypy
+      run: python3 -m mypy utils prepare_layers prepare_species threats
+
     - name: Tests
-      run: |
-        python3 -m pytest ./tests
+      run: python3 -m pytest ./tests
diff --git a/.mypy.ini b/.mypy.ini
new file mode 100644
index 0000000..d8ac83e
--- /dev/null
+++ b/.mypy.ini
@@ -0,0 +1,4 @@
+[mypy]
+ignore_missing_imports = True
+explicit_package_bases = False
+no_namespace_packages = True
diff --git a/Dockerfile b/Dockerfile
index ade1030..1217663 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,3 +53,4 @@ ENV PYTHONPATH=/root/star
 
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
+RUN python3 -m mypy prepare_layers prepare_species utils tests
diff --git
a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 9671ba3..938547a 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -210,7 +210,7 @@ def tidy_reproject_save( def process_systems( systems_data: List[Tuple], report: SpeciesReport, -) -> None: +) -> List: if len(systems_data) == 0: raise ValueError("No systems found") if len(systems_data) > 1: @@ -250,9 +250,9 @@ def process_systems( ] def process_threats( - threat_data: List, + threat_data: List[Tuple[int,str,str]], report: SpeciesReport, -) -> bool: +) -> List[Tuple[int,int]]: cleaned_threats = [] for code, scope, severity in threat_data: if scope is None or scope.lower() == "unknown": @@ -329,9 +329,9 @@ def process_row( class_name: str, output_directory_path: str, target_projection: Optional[str], - presence: Tuple[int], + presence: Tuple[int, ...], row: Tuple, -) -> Tuple: +) -> SpeciesReport: connection = psycopg2.connect(DB_CONFIG) register(connection) @@ -436,7 +436,7 @@ def extract_data_per_species( connection = psycopg2.connect(DB_CONFIG) cursor = connection.cursor() - excludes = tuple([]) + excludes: Tuple = tuple([]) if excludes_path is not None: try: df = pd.read_csv(excludes_path) diff --git a/utils/aoh_generator.py b/utils/aoh_generator.py index 1392f36..2408cf0 100644 --- a/utils/aoh_generator.py +++ b/utils/aoh_generator.py @@ -7,12 +7,11 @@ import pandas as pd def aoh_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -49,21 +48,21 @@ def main() -> None: parser = argparse.ArgumentParser(description="Species and seasonality generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output" diff --git a/utils/threats_generator.py b/utils/threats_generator.py index 7b2a281..6538e8c 100644 --- a/utils/threats_generator.py +++ b/utils/threats_generator.py @@ -7,12 +7,11 @@ import pandas as pd def threats_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -40,24 +39,24 @@ def threats_generator( df.to_csv(output_csv_path, index=False) def main() -> None: - parser = argparse.ArgumentParser(description="threat tasts generator.") + parser = argparse.ArgumentParser(description="threat tasks generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output" From 0970c4c455ac87dbb1551fdcbfab262058e962a9 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 16:21:48 +0100 Subject: [PATCH 
08/21] Remove unused imports --- prepare_species/apply_birdlife_data.py | 1 - prepare_species/extract_species_data_psql.py | 1 - 2 files changed, 2 deletions(-) diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 0380477..60c2a78 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -1,5 +1,4 @@ import argparse -import importlib import math import os diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 938547a..f49087a 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -1,5 +1,4 @@ import argparse -import importlib import json import logging import math From 18b4f8c337bf49d48227e47f3226458fdb3b8eb3 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 2 Oct 2025 18:54:00 +0100 Subject: [PATCH 09/21] Typing on the threats utils --- threats/threat_summation.py | 54 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 5f1ffdd..194d157 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -14,12 +14,9 @@ gdal.SetCacheMax(1024 * 1024 * 32) def worker( - filename: str, - result_dir: str, + output_tif: Path, input_queue: Queue, ) -> None: - output_tif = os.path.join(result_dir, filename) - merged_result = None while True: @@ -43,19 +40,18 @@ def worker( def raster_sum( images_list: List[Path], - output_filename: str, + output_filename: Path, processes_count: int ) -> None: - result_dir, filename = os.path.split(output_filename) - os.makedirs(result_dir, exist_ok=True) + os.makedirs(output_filename.parent, exist_ok=True) - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir_str: + tempdir = Path(tempdir_str) with Manager() as manager: source_queue = manager.Queue() workers = [Process(target=worker, args=( - f"{index}.tif", - tempdir, + tempdir / f"{index}.tif", source_queue )) for index in range(processes_count)] for worker_process in workers: @@ -80,8 +76,7 @@ def raster_sum( # here we should have now a set of images in tempdir to merge single_worker = Process(target=worker, args=( - filename, - result_dir, + output_filename, source_queue )) single_worker.start() @@ -103,17 +98,17 @@ def raster_sum( time.sleep(1) def reduce_to_next_level( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) + files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: code, _ = os.path.splitext(filename.name) next_level_threat_id = ".".join(code.split('.')[:-1]) @@ -126,22 +121,22 @@ def reduce_to_next_level( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) def reduce_from_species( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) 
+ files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: threat_code = filename.parts[-2] levels = threat_code.split('.') @@ -159,31 +154,30 @@ def reduce_from_species( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) - def threat_summation( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: os.makedirs(output_directory, exist_ok=True) # All these files are at level3 to start with, so first make level2 print("processing level 2") - level2_target = os.path.join(output_directory, "level2") + level2_target = output_directory / "level2" reduce_from_species(rasters_directory, level2_target, processes_count) # Now reduce level2 to level1 print("processing level 1") - level1_target = os.path.join(output_directory, "level1") + level1_target = output_directory / "level1" reduce_to_next_level(level2_target, level1_target, processes_count) # Now build a final top level STAR print("processing level 0") - final_target = os.path.join(output_directory, "level0") + final_target = output_directory / "level0" reduce_to_next_level(level1_target, final_target, processes_count) @@ -191,14 +185,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generates the combined, and level 1 and level 2 threat rasters.") parser.add_argument( "--threat_rasters", - type=str, + type=Path, required=True, dest="rasters_directory", help="GeoTIFF file containing level three per species threats" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_directory", help="Destination directory file for results." 
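For context on the two reduction patches above: `reduce_from_species` and `reduce_to_next_level` both group rasters by trimming one dotted level from each threat code, so `2.3.1.tif` and `2.3.2.tif` are summed into `2.3.tif`, and `2.3.tif` in turn into `2.tif`. A minimal standalone sketch of that bucketing logic, assuming IUCN-style dotted threat codes as filenames (the `bucket_by_parent_level` helper name is illustrative, not part of the patch):

```python
from pathlib import Path

def bucket_by_parent_level(files: list[Path]) -> dict[str, list[Path]]:
    # Strip the ".tif" suffix, then drop the last dotted component, so
    # "2.3.1.tif" lands in the "2.3" bucket and "2.3.tif" in the "2" bucket.
    # Top-level codes such as "2.tif" collapse to the empty-string key,
    # which is what lets the final pass reduce level 1 down to level 0.
    buckets: dict[str, list[Path]] = {}
    for filename in files:
        parent = ".".join(filename.stem.split(".")[:-1])
        buckets.setdefault(parent, []).append(filename)
    return buckets

# bucket_by_parent_level([Path("2.3.1.tif"), Path("2.3.2.tif"), Path("2.4.tif")])
# => {"2.3": [Path("2.3.1.tif"), Path("2.3.2.tif")], "2": [Path("2.4.tif")]}
```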
From 33d74d4396d6d67b6236fdcf6335e6ab4fc8d747 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 3 Oct 2025 07:54:48 +0100 Subject: [PATCH 10/21] More Python modernisation --- prepare_layers/convert_crosswalk.py | 9 ++-- prepare_species/apply_birdlife_data.py | 14 +++--- prepare_species/extract_species_data_psql.py | 49 ++++++++++---------- threats/threat_processing.py | 20 ++++---- threats/threat_summation.py | 3 +- utils/collect_validation_data.py | 13 +++--- 6 files changed, 55 insertions(+), 53 deletions(-) diff --git a/prepare_layers/convert_crosswalk.py b/prepare_layers/convert_crosswalk.py index 37926ff..4049d6e 100644 --- a/prepare_layers/convert_crosswalk.py +++ b/prepare_layers/convert_crosswalk.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path import pandas as pd @@ -28,8 +29,8 @@ } def convert_crosswalk( - original_path: str, - output_path: str, + original_path: Path, + output_path: Path, ) -> None: original = pd.read_csv(original_path) @@ -56,14 +57,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Convert IUCN crosswalk to minimal common format.") parser.add_argument( '--original', - type=str, + type=Path, help="Original format", required=True, dest="original_path", ) parser.add_argument( '--output', - type=str, + type=Path, help='Destination minimal file', required=True, dest='output_path', diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py index 60c2a78..da65746 100644 --- a/prepare_species/apply_birdlife_data.py +++ b/prepare_species/apply_birdlife_data.py @@ -1,6 +1,6 @@ import argparse import math -import os +from pathlib import Path import aoh import geopandas as gpd @@ -22,8 +22,8 @@ # Occasional upper elevation def apply_birdlife_data( - geojson_directory_path: str, - overrides_path: str, + geojson_directory_path: Path, + overrides_path: Path, ) -> None: overrides = pd.read_csv(overrides_path, encoding="latin1") @@ -31,8 +31,8 @@ def apply_birdlife_data( if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): continue - path = os.path.join(geojson_directory_path, "AVES", "current", f'{row["SIS ID"]}.geojson') - if not os.path.exists(path): + path = geojson_directory_path / "AVES" / "current" / f'{row["SIS ID"]}.geojson' + if not path.exists(): continue species_info = gpd.read_file(path) @@ -55,14 +55,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") parser.add_argument( '--geojsons', - type=str, + type=Path, help='Directory where per species Geojson is stored', required=True, dest='geojson_directory_path', ) parser.add_argument( '--overrides', - type=str, + type=Path, help="CSV of overrides", required=True, dest="overrides", diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index f49087a..43b65c7 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -5,7 +5,8 @@ import os from functools import partial from multiprocessing import Pool -from typing import Any, List, Optional, Set, Tuple +from pathlib import Path +from typing import Any, Optional import aoh import geopandas as gpd @@ -180,13 +181,13 @@ def __getattr__(self, name: str) -> Any: return self.info[name] return None - def as_row(self) -> List: + def as_row(self) -> list: return [self.info[k] for k in self.REPORT_COLUMNS] def tidy_reproject_save( gdf: gpd.GeoDataFrame, report: SpeciesReport, - 
output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], ) -> None: src_crs = pyproj.CRS.from_epsg(4326) @@ -200,16 +201,16 @@ def tidy_reproject_save( elevation_seperation=ELEVATION_SPREAD, ) os.makedirs(output_directory_path, exist_ok=True) - output_path = os.path.join(output_directory_path, f"{grow.id_no}.geojson") + output_path = output_directory_path / f"{grow.id_no}.geojson" res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry") res_projected = res.to_crs(target_crs) res_projected.to_file(output_path, driver="GeoJSON") report.filename = output_path def process_systems( - systems_data: List[Tuple], + systems_data: list[tuple], report: SpeciesReport, -) -> List: +) -> list: if len(systems_data) == 0: raise ValueError("No systems found") if len(systems_data) > 1: @@ -249,9 +250,9 @@ def process_systems( ] def process_threats( - threat_data: List[Tuple[int,str,str]], + threat_data: list[tuple[int, str, str]], report: SpeciesReport, -) -> List[Tuple[int,int]]: +) -> list[tuple[int, int]]: cleaned_threats = [] for code, scope, severity in threat_data: if scope is None or scope.lower() == "unknown": @@ -267,9 +268,9 @@ def process_threats( return cleaned_threats def process_habitats( - habitats_data: List[List[str]], + habitats_data: list[list[str]], report: SpeciesReport, -) -> Set: +) -> set: if len(habitats_data) == 0: # Promote to "Unknown" habitats_data = [["18"]] @@ -295,7 +296,7 @@ def process_habitats( return habitats def process_geometries( - geometries_data: List[Tuple[int,shapely.Geometry]], + geometries_data: list[tuple[int, shapely.Geometry]], report: SpeciesReport, ) -> shapely.Geometry: if len(geometries_data) == 0: @@ -326,10 +327,10 @@ def process_geometries( def process_row( class_name: str, - output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], - presence: Tuple[int, ...], - row: Tuple, + presence: tuple[int, ...], + row: tuple, ) -> SpeciesReport: connection = psycopg2.connect(DB_CONFIG) @@ -399,7 +400,7 @@ def process_row( return report def apply_overrides( - overrides_path: str, + overrides_path: Path, results, ): overrides = pd.read_csv(overrides_path, encoding="latin1") @@ -426,16 +427,16 @@ def apply_overrides( def extract_data_per_species( class_name: str, - overrides_path: Optional[str], - excludes_path: Optional[str], - output_directory_path: str, + overrides_path: Optional[Path], + excludes_path: Optional[Path], + output_directory_path: Path, target_projection: Optional[str], ) -> None: connection = psycopg2.connect(DB_CONFIG) cursor = connection.cursor() - excludes: Tuple = tuple([]) + excludes: tuple = tuple([]) if excludes_path is not None: try: df = pd.read_csv(excludes_path) @@ -447,7 +448,7 @@ def extract_data_per_species( # For STAR-R we need historic data, but for STAR-T we just need current. 
# for era, presence in [("current", (1, 2)), ("historic", (1, 2, 4, 5))]:
     for era, presence in [("current", (1, 2))]:
-        era_output_directory_path = os.path.join(output_directory_path, era)
+        era_output_directory_path = output_directory_path / era
 
         # You can't do NOT IN on an empty list in SQL
         if excludes:
@@ -478,7 +479,7 @@
             columns=SpeciesReport.REPORT_COLUMNS
         ).sort_values('id_no')
         os.makedirs(era_output_directory_path, exist_ok=True)
-        reports_df.to_csv(os.path.join(era_output_directory_path, "report.csv"), index=False)
+        reports_df.to_csv(era_output_directory_path / "report.csv", index=False)
 
 def main() -> None:
     parser = argparse.ArgumentParser(description="Process aggregate species data to per-species-file.")
@@ -491,21 +492,21 @@ def main() -> None:
     )
     parser.add_argument(
         '--overrides',
-        type=str,
+        type=Path,
         help="CSV of overrides",
         required=False,
         dest="overrides",
     )
     parser.add_argument(
         '--excludes',
-        type=str,
+        type=Path,
         help="CSV of taxon IDs to not include",
         required=False,
         dest="excludes"
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help='Directory where per species GeoJSON is stored',
         required=True,
         dest='output_directory_path',
diff --git a/threats/threat_processing.py b/threats/threat_processing.py
index 8e6011e..0e04c73 100644
--- a/threats/threat_processing.py
+++ b/threats/threat_processing.py
@@ -2,15 +2,16 @@
 import json
 import os
 import sys
+from pathlib import Path
 
 import geopandas as gpd
 import yirgacheffe as yg
 from pyogrio.errors import DataSourceError
 
 def threat_processing_per_species(
-    species_data_path: str,
-    aoh_path: str,
-    output_directory_path: str,
+    species_data_path: Path,
+    aoh_path: Path,
+    output_directory_path: Path,
 ) -> None:
     try:
         data = gpd.read_file(species_data_path)
@@ -26,8 +27,7 @@ def threat_processing_per_species(
     threat_data = json.loads(data.threats[0])
 
     try:
-        aoh_base, _ = os.path.splitext(aoh_path)
-        aoh_data_path = aoh_base + ".json"
+        aoh_data_path = aoh_path.with_suffix(".json")
         with open(aoh_data_path, "r", encoding="UTF-8") as f:
             aoh_data = json.load(f)
             aoh_total = aoh_data["aoh_total"]
@@ -46,9 +46,9 @@ def threat_processing_per_species(
         per_threat_per_species_score = weighted_species * proportional_threat_weight
         print(per_threat_per_species_score.sum())
 
-        threat_dir_path = os.path.join(output_directory_path, str(threat_id))
+        threat_dir_path = output_directory_path / str(threat_id)
         os.makedirs(threat_dir_path, exist_ok=True)
-        output_path = os.path.join(threat_dir_path, f"{taxon_id}.tif")
+        output_path = threat_dir_path / f"{taxon_id}.tif"
         per_threat_per_species_score.to_geotiff(output_path)
 
 def main() -> None:
@@ -57,21 +57,21 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Calculate per species threat layers")
     parser.add_argument(
         '--speciesdata',
-        type=str,
+        type=Path,
         help="Single species/seasonality geojson.",
         required=True,
         dest="species_data_path"
     )
     parser.add_argument(
         '--aoh',
-        type=str,
+        type=Path,
         help="AoH raster of species.",
         required=True,
         dest="aoh_path"
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help='Directory where per species/threat layers are stored',
         required=True,
         dest='output_directory_path',
diff --git a/threats/threat_summation.py b/threats/threat_summation.py
index 194d157..5b93ede 100644
--- a/threats/threat_summation.py
+++ b/threats/threat_summation.py
@@ -5,7 +5,6 @@
 import time
 from multiprocessing import Manager, Process, Queue, cpu_count
 from pathlib import Path
-from typing import List
 
 import yirgacheffe as yg
 from yirgacheffe.layers import RasterLayer
 from osgeo import gdal
 
 gdal.SetCacheMax(1024 * 1024 * 32)
@@ -39,7 +38,7 @@ def worker(
         merged_result.to_geotiff(output_tif)
 
 def raster_sum(
-    images_list: List[Path],
+    images_list: list[Path],
     output_filename: Path,
     processes_count: int
 ) -> None:
diff --git a/utils/collect_validation_data.py b/utils/collect_validation_data.py
index 278d71d..49a3563 100644
--- a/utils/collect_validation_data.py
+++ b/utils/collect_validation_data.py
@@ -1,13 +1,14 @@
 import argparse
 import os
 import shutil
+from pathlib import Path
 
 import pandas as pd
 
 def collect_validation_data(
-    model_results_path: str,
-    data_dir: str,
-    output_dir: str,
+    model_results_path: Path,
+    data_dir: Path,
+    output_dir: Path,
 ) -> None:
     model_results = pd.read_csv(model_results_path)
     os.makedirs(output_dir, exist_ok=True)
@@ -29,21 +30,21 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Collect range/AoH data for species that failed validation")
     parser.add_argument(
         '--model_results',
-        type=str,
+        type=Path,
         help="CSV of model validation results",
         required=True,
         dest="model_results_path"
     )
     parser.add_argument(
         '--datadir',
-        type=str,
+        type=Path,
         help="directory for results",
         required=True,
         dest="data_dir",
     )
     parser.add_argument(
         '--output',
-        type=str,
+        type=Path,
         help="name of output directory",
         required=True,
         dest="output"

From dace36fe5f29d08379f507a3dddb9d3d7f0c2b04 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Fri, 3 Oct 2025 08:58:25 +0100
Subject: [PATCH 11/21] Unify run and slurm scripts.

---
 .github/workflows/python-package.yml |   6 +-
 scripts/run.sh                       | 107 +++++++++++++++----------
 scripts/slurm.sh                     | 115 ---------------------------
 3 files changed, 71 insertions(+), 157 deletions(-)
 delete mode 100644 scripts/slurm.sh

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 397183d..94ca3e0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Install system
       run: |
         apt-get update -qqy
-        apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev
+        apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev shellcheck
     - uses: actions/checkout@v4
       with:
         submodules: 'true'
@@ -46,3 +46,7 @@ jobs:
 
     - name: Tests
       run: python3 -m pytest ./tests
+
+    - name: Script checks
+      run: |
+        shellcheck ./scripts/run.sh
diff --git a/scripts/run.sh b/scripts/run.sh
index 2ce4863..c507bdc 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -7,7 +7,34 @@
 # https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
 # https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
 
+# Set shell script to exit on first error (-e) and to output commands being run to make
+# reviewing logs easier (-x)
 set -e
+set -x
+
+# We use two Go tools, so add go/bin to our PATH, as under SLURM they're likely
+# to be installed locally
+export PATH="${PATH}":"${HOME}"/go/bin
+if ! hash littlejohn 2>/dev/null; then
+    echo "Please ensure littlejohn is available"
+    exit 1
+fi
+if ! hash reclaimer 2>/dev/null; then
+    echo "Please ensure reclaimer is available"
+    exit 1
+fi
+
+# Detect if we're running under SLURM
+if [[ -n "${SLURM_JOB_ID}" ]]; then
+    # Slurm users will probably need to customise this
+    # shellcheck disable=SC1091
+    source "${HOME}"/venvs/star/bin/activate
+    cd "${HOME}"/dev/star
+    PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}"
+else
+    PROCESS_COUNT=$(nproc --all)
+fi
+echo "Using ${PROCESS_COUNT} threads."
if [ -z "${DATADIR}" ]; then echo "Please specify $DATADIR" @@ -19,96 +46,94 @@ if [ -z "${VIRTUAL_ENV}" ]; then exit 1 fi -export CPUS=`getconf _NPROCESSORS_ONLN` -export THREADS=$(($CPUS / 2)) -echo "Using $THREADS threads." - declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA") # Get habitat layer and prepare for use -if [ ! -d ${DATADIR}/habitat_layers ]; then - if [ ! -f ${DATADIR}/habitat/raw.tif ]; then +if [ ! -d "${DATADIR}"/habitat_layers ]; then + if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then echo "Fetching habitat map..." - reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif + reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output "${DATADIR}"/habitat/raw.tif fi echo "Processing habitat map..." - python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \ - --scale 1000.0 \ - --projection "ESRI:54009" \ - --output ${DATADIR}/tmp_habitat_layers/current - mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers + aoh-habitat-process --habitat "${DATADIR}"/habitat/raw.tif \ + --scale 1000.0 \ + --projection "ESRI:54009" \ + --output "${DATADIR}"/tmp_habitat_layers/current + mv "${DATADIR}"/tmp_habitat_layers "${DATADIR}"/habitat_layers fi -if [ ! -d ${DATADIR}/masks ]; then +if [ ! -d "${DATADIR}"/masks ]; then echo "Processing masks..." - python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \ - --output_directory ${DATADIR}/masks + python3 ./prepare_layers/make_masks.py --habitat_layers "${DATADIR}"/habitat_layers/current \ + --output_directory "${DATADIR}"/masks fi # Fetch and prepare the elevation layers -if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then - if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then +if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then + if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then echo "Fetching elevation map..." - reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif + reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then echo "Generating elevation max layer..." - gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-max-1k.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]; then echo "Generating elevation min layer..." - gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-min-1k.tif fi fi # Generate the crosswalk table -if [ ! -f ${DATADIR}/crosswalk.csv ]; then +if [ ! 
-f "${DATADIR}"/crosswalk.csv ]; then echo "Generating crosswalk table..." - python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv + python3 ./prepare_layers/convert_crosswalk.py --original "${PWD}"/data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv fi # Get species data per taxa from IUCN data for TAXA in "${TAXALIST[@]}" do - echo "Extracting species data for ${TAXA}..." - python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv + if [ ! -d "${DATADIR}"/species-info/"${TAXA}"/ ]; then + echo "Extracting species data for ${TAXA}..." + python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" --output "${DATADIR}"/species-info/"${TAXA}"/ --projection "ESRI:54009" --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv + fi done if [ -f data/BL_Species_Elevations_2023.csv ]; then echo "Applying birdlife data..." - python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv + python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv fi echo "Generating AoH task list..." -python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv +python3 ./utils/aoh_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/aohbatch.csv echo "Generating AoHs..." -littlejohn -j ${THREADS} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py +littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/aohbatch.csv "${VIRTUAL_ENV}"/bin/aoh-calc # Calculate predictors from AoHs echo "Generating species richness..." -python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/summaries/species_richness.tif +python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/summaries/species_richness.tif echo "Generating endemism..." -python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \ - --species_richness ${DATADIR}/summaries/species_richness.tif \ - --output ${DATADIR}/summaries/endemism.tif +python3 ./aoh-calculator/summaries/endemism.py --aohs_folder "${DATADIR}"/aohs/current/ \ + --species_richness "${DATADIR}"/summaries/species_richness.tif \ + --output "${DATADIR}"/summaries/endemism.tif # Aoh Validation echo "Collating validation data..." -python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/validation/aohs.csv +python3 ./aoh-calculator/validation/collate_data.py --aoh_results "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/validation/aohs.csv echo "Calculating model validation..." -python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \ - --output ${DATADIR}/validation/model_validation.csv +python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data "${DATADIR}"/validation/aohs.csv \ + --output "${DATADIR}"/validation/model_validation.csv # Threats echo "Generating threat task list..." 
-python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv
+python3 ./utils/threats_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/threatbatch.csv
 
 echo "Generating threat rasters..."
-littlejohn -j ${THREADS} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py
+littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/threatbatch.log -c "${DATADIR}"/threatbatch.csv "${VIRTUAL_ENV}"/bin/python3 -- ./threats/threat_processing.py
 
 echo "Summarising threats..."
-python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results
+python3 ./threats/threat_summation.py --threat_rasters "${DATADIR}"/threat_rasters --output "${DATADIR}"/threat_results
diff --git a/scripts/slurm.sh b/scripts/slurm.sh
deleted file mode 100644
index c0eed55..0000000
--- a/scripts/slurm.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-#
-# Assumes you've set up a Python virtual environment in the current directory.
-#
-# In addition to the Python environment, you will need the following extra command line tools:
-#
-# https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly
-# https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel
-
-set -e
-
-# shellcheck disable=SC1091
-source ${HOME}/venvs/life/bin/activate
-cd ${HOME}/dev/star
-export PATH=$PATH:$HOME/go/bin
-
-if [ -z "${DATADIR}" ]; then
-    echo "Please specify DATADIR"
-    exit 1
-fi
-
-if [ -z "${VIRTUAL_ENV}" ]; then
-    echo "Please run within a virtualenv"
-    exit 1
-fi
-
-declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
-
-# Get habitat layer and prepare for use
-if [ ! -d ${DATADIR}/habitat_layers ]; then
-    if [ ! -f ${DATADIR}/habitat/raw.tif ]; then
-        echo "Fetching habitat map..."
-        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif
-    fi
-
-    echo "Processing habitat map..."
-    python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \
-        --scale 1000.0 \
-        --projection "ESRI:54009" \
-        --output ${DATADIR}/tmp_habitat_layers/current
-    mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers
-fi
-
-if [ ! -d ${DATADIR}/masks ]; then
-    echo "Processing masks..."
-    python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \
-        --output_directory ${DATADIR}/masks
-fi
-
-# Fetch and prepare the elevation layers
-if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then
-    if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then
-        echo "Fetching elevation map..."
-        reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif
-    fi
-    if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then
-        echo "Generating elevation max layer..."
-        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif
-    fi
-    if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then
-        echo "Generating elevation min layer..."
-        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif
-    fi
-fi
-
-# Generate the crosswalk table
-if [ ! -f ${DATADIR}/crosswalk.csv ]; then
-    echo "Generating crosswalk table..."
-    python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv
-fi
-
-# Get species data per taxa from IUCN data
-for TAXA in "${TAXALIST[@]}"
-do
-    echo "Extracting species data for ${TAXA}..."
-    python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv
-done
-
-if [ -f data/BL_Species_Elevations_2023.csv ]; then
-    echo "Applying birdlife data..."
-    python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv
-fi
-
-echo "Generating AoH task list..."
-python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv
-
-echo "Generating AoHs..."
-littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py
-
-# Calculate predictors from AoHs
-echo "Generating species richness..."
-python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \
-    --output ${DATADIR}/summaries/species_richness.tif
-echo "Generating endemism..."
-python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \
-    --species_richness ${DATADIR}/summaries/species_richness.tif \
-    --output ${DATADIR}/summaries/endemism.tif
-
-# Aoh Validation
-echo "Collating validation data..."
-python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \
-    --output ${DATADIR}/validation/aohs.csv
-echo "Calculating model validation..."
-python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \
-    --output ${DATADIR}/validation/model_validation.csv
-
-# Threats
-echo "Generating threat task list..."
-python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv
-
-echo "Generating threat rasters..."
-littlejohn -j ${SLURM_JOB_CPUS_PER_NODE} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py
-
-echo "Summarising threats..."
-python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results
From e70cc3402343903766161c75a69c757bc1e130a1 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Wed, 15 Oct 2025 11:52:44 +0000
Subject: [PATCH 12/21] Small tidying

---
 requirements.txt | 2 --
 scripts/run.sh   | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 31c1497..b1514f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,14 +8,12 @@ pymer4
 pyproj
 scikit-image
 requests
-zenodo_search
 yirgacheffe
 aoh
 
 gdal[numpy]
 
 git+https://github.com/quantifyearth/iucn_modlib
-git+https://github.com/quantifyearth/pyshark
 
 pylint
 mypy
diff --git a/scripts/run.sh b/scripts/run.sh
index c507bdc..a5dcab6 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -48,6 +48,10 @@ fi
 
 declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA")
 
+if [ ! -d "${DATADIR}" ]; then
-d "${DATADIR}" ]; then + mkdir "${DATADIR}" +fi + # Get habitat layer and prepare for use if [ ! -d "${DATADIR}"/habitat_layers ]; then if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then @@ -73,6 +77,7 @@ fi if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then echo "Fetching elevation map..." + mkdir -p "${DATADIR}"/elevation reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif fi if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then From 99bf082ac14f914236f0393fd04fc685a8f87d9e Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 16 Oct 2025 07:27:03 +0100 Subject: [PATCH 13/21] Full run with updated aoh --- requirements.txt | 2 +- scripts/run.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index b1514f4..56c2f4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pyproj scikit-image requests yirgacheffe -aoh +aoh[validation] gdal[numpy] diff --git a/scripts/run.sh b/scripts/run.sh index a5dcab6..8e875bb 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -93,7 +93,7 @@ fi # Generate the crosswalk table if [ ! -f "${DATADIR}"/crosswalk.csv ]; then echo "Generating crosswalk table..." - python3 ./prepare_layers/convert_crosswalk.py --original "${PWD}"/data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv + python3 ./prepare_layers/convert_crosswalk.py --original "${DATADIR}"/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv fi # Get species data per taxa from IUCN data @@ -105,9 +105,9 @@ do fi done -if [ -f data/BL_Species_Elevations_2023.csv ]; then +if [ -f "${DATADIR}"/BL_Species_Elevations_2023.csv ]; then echo "Applying birdlife data..." - python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv + python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides "${DATADIR}"/BL_Species_Elevations_2023.csv fi echo "Generating AoH task list..." @@ -118,20 +118,20 @@ littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/ao # Calculate predictors from AoHs echo "Generating species richness..." -python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder "${DATADIR}"/aohs/current/ \ - --output "${DATADIR}"/summaries/species_richness.tif +aoh-species-richness --aohs_folder "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/summaries/species_richness.tif echo "Generating endemism..." -python3 ./aoh-calculator/summaries/endemism.py --aohs_folder "${DATADIR}"/aohs/current/ \ - --species_richness "${DATADIR}"/summaries/species_richness.tif \ - --output "${DATADIR}"/summaries/endemism.tif +aoh-endemism --aohs_folder "${DATADIR}"/aohs/current/ \ + --species_richness "${DATADIR}"/summaries/species_richness.tif \ + --output "${DATADIR}"/summaries/endemism.tif # Aoh Validation echo "Collating validation data..." -python3 ./aoh-calculator/validation/collate_data.py --aoh_results "${DATADIR}"/aohs/current/ \ - --output "${DATADIR}"/validation/aohs.csv +aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/validation/aohs.csv echo "Calculating model validation..." 
-python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
-    --output "${DATADIR}"/validation/model_validation.csv
+aoh-validate-prevalence --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
+    --output "${DATADIR}"/validation/model_validation.csv
 
 # Threats
 echo "Generating threat task list..."
From 6568275536d9bd30cd6febd8a935f6c3314a1d75 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:41:47 +0100
Subject: [PATCH 14/21] Updated readme for required inputs

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 93895b4..948ce0e 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,14 @@ $ cd star
 $ git submodule update --init --recursive
 ```
 
+## Additional inputs
+
+There are some additional inputs required to run the pipeline, which should be plated in the directory you use to store the pipeline results.
+
+* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
+* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
+* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
+
 ## Running the pipeline
 
 The easiest way to get started will be to run `scripts/run.sh` under a linux environment.
From a619824a37f9a34af6df5f95fc96045afe509c29 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:43:21 +0100
Subject: [PATCH 15/21] script formatting

---
 scripts/run.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/run.sh b/scripts/run.sh
index 8e875bb..a1d153f 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -56,7 +56,9 @@ fi
 if [ ! -d "${DATADIR}"/habitat_layers ]; then
     if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then
         echo "Fetching habitat map..."
-        reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output "${DATADIR}"/habitat/raw.tif
+        reclaimer zenodo --zenodo_id 3939050 \
+            --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif \
+            --output "${DATADIR}"/habitat/raw.tif
     fi
 
     echo "Processing habitat map..."
From 2186abd3af14778e2062a1fa7e66d590ced320cc Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 07:54:15 +0100
Subject: [PATCH 16/21] More readme instructions

---
 README.md        | 30 +++++++++++++++++++++++++-----
 requirements.txt |  2 --
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 948ce0e..497b609 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,34 @@ See [method.md](method.md) for a description of the methodology, or `scripts/run
 
 # Running the pipeline
 
-## Checking out the code
+## Requirements
 
-This repository uses submodules, so once you have cloned it, you need to fetch the submodules:
+The easiest way to run the pipeline is using the included Dockerfile to build a Docker container which will have all the dependencies installed in it.
+
+If not, you will need:
+
+* Python3 >= 3.10
+* GDAL
+* R (required for validation)
+
+If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
+
+With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version.
+
+```shell
+$ python3 -m venv ./venv
+$ . ./venv/bin/activate
+(venv) $ gdalinfo --version
+GDAL 3.11.3 "Eganville", released 2025/07/12
+(venv) $ pip install gdal[numpy]==3.11.3
+...
+(venv) $ pip install -r requirements.txt
+```
+
+You will also need to install the R stats packages required for the validation stage:
 
 ```shell
-$ git clone https://github.com/quantifyearth/star.git
-$ cd star
-$ git submodule update --init --recursive
+$ R -e "install.packages(c('lme4', 'lmerTest'), repos='https://cran.rstudio.com/')"
 ```
 
 ## Additional inputs
diff --git a/requirements.txt b/requirements.txt
index 56c2f4b..6fad873 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-alive-progress
 numpy
 geopandas
 postgis
@@ -7,7 +6,6 @@ psutil
 pymer4
 pyproj
 scikit-image
-requests
 yirgacheffe
 aoh[validation]
 
 gdal[numpy]
From 0f32da808522aadb535fdd2e8b73a62a3d480a55 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:45:31 +0100
Subject: [PATCH 17/21] Update to GDAL 3.11.4

---
 .github/workflows/python-package.yml | 4 ++--
 Dockerfile                           | 6 ++++--
 README.md                            | 2 ++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 94ca3e0..5724e1c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.3
+    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
     strategy:
       fail-fast: false
       matrix:
@@ -35,7 +35,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         python -m pip install gdal[numpy]==3.11.3
+         python -m pip install gdal[numpy]==3.11.4
          python -m pip install -r requirements.txt
 
      - name: Lint with pylint
diff --git a/Dockerfile b/Dockerfile
index 1217663..5adc593 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,14 @@ WORKDIR /go/littlejohn
 RUN go mod tidy
 RUN go build
 
-FROM ghcr.io/osgeo/gdal:ubuntu-small-3.10.0
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
 
 RUN apt-get update -qqy && \
     apt-get install -qy \
         git \
         cmake \
         python3-pip \
+        shellcheck \
         r-base \
         libpq-dev \
         libtirpc-dev \
@@ -27,7 +28,7 @@ COPY --from=reclaimerbuild /go/reclaimer/reclaimer /bin/reclaimer
 COPY --from=littlejohnbuild /go/littlejohn/littlejohn /bin/littlejohn
 
 RUN rm /usr/lib/python3.*/EXTERNALLY-MANAGED
-RUN pip install gdal[numpy]==3.10.0
+RUN pip install gdal[numpy]==3.11.4
 COPY requirements.txt /tmp/
 RUN pip install -r /tmp/requirements.txt
 
@@ -54,3 +55,4 @@ ENV PYTHONPATH=/root/star
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
 RUN python3 -m mypy prepare_layers prepare_speices utils tests
+RUN shellcheck ./scripts/run.sh
diff --git a/README.md b/README.md
index 497b609..87c8fd8 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ If not, you will need:
 * Python3 >= 3.10
 * GDAL
 * R (required for validation)
+* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a Go tool for fetching data from Zenodo
+* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a Go tool for running scripts in parallel
 
 If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
From 380f34fffe394322dfa911c1c9886e3d05e61d38 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:46:17 +0100
Subject: [PATCH 18/21] README update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 87c8fd8..f41a1d7 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).
 
-With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version.
+With those you should set up a Python virtual environment to install all the required packages. The one trick to this is you need to match the Python GDAL package to your installed GDAL version. For example, on my machine I did the following:
 
 ```shell
 $ python3 -m venv ./venv
From ad47f640911fbbed39dbd9302e8c10aa42387024 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 08:48:53 +0100
Subject: [PATCH 19/21] Fix typo in dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 5adc593..913f289 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -54,5 +54,5 @@ ENV PYTHONPATH=/root/star
 RUN python3 -m pytest ./tests
 RUN python3 -m pylint prepare_layers prepare_species utils tests
-RUN python3 -m mypy prepare_layers prepare_speices utils tests
+RUN python3 -m mypy prepare_layers prepare_species utils tests
 RUN shellcheck ./scripts/run.sh
From 502d23b74694ac6f087368f9862828445bc0ac56 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 09:05:49 +0100
Subject: [PATCH 20/21] Address review comments

---
 .gitmodules      |  0
 README.md        | 90 +++++++++++++++++++++++++++++-------------------
 requirements.txt |  7 ++--
 scripts/run.sh   |  3 +-
 4 files changed, 58 insertions(+), 42 deletions(-)
 delete mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29..0000000
diff --git a/README.md b/README.md
index f41a1d7..1749c29 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,53 @@ An implementation of the threat based [STAR biodiversity metric by Muir et al](h
 
 See [method.md](method.md) for a description of the methodology, or `scripts/run.sh` for how to execute the pipeline.
 
-# Running the pipeline
+## Checking out the code
 
-## Requirements
+The code is available on GitHub, and can be checked out from there:
 
-The easiest way to run the pipeline is using the included Dockerfile to build a Docker container which will have all the dependencies installed in it.
+
+```shell
+$ git clone https://github.com/quantifyearth/STAR.git
+...
+$ cd STAR
+```
+
+## Additional inputs
+
+There are some additional inputs required to run the pipeline, which should be placed in the directory you use to store the pipeline results.
+
+* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
+* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
+* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
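+
+As a purely illustrative sketch (assuming the file names listed above and a results directory of /some/local/dir - your paths may differ), the directory might look something like this before a first run:
+
+```shell
+$ ls /some/local/dir
+BL_Species_Elevations_2023.csv
+crosswalk_bin_T.csv
+SpeciesList_generalisedRangePolygons.csv
+```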
+
+The script also assumes you have a Postgres database with the IUCN Redlist data in it.
+
+## Running the pipeline
+
+There are two ways to run the pipeline. The easiest is to use Docker if you have it available, as it will manage all the dependencies for you. Alternatively, you can check the code out and run it locally, though that requires a little more effort.
 
-If not, you will need:
+### Running with Docker
+
+There is a Dockerfile included, based on the GDAL container image, which is set up to install everything ready to use. You can build that using:
+
+```shell
+$ docker buildx build -t star .
+```
+
+You can then invoke the run script using this. You should map an external folder into the container as a place to store the intermediate data and final results, and you should provide details about the Postgres instance with the IUCN Redlist:
+
+```shell
+$ docker run --rm -v /some/local/dir:/data \
+    -e DB_HOST=localhost \
+    -e DB_NAME=iucnredlist \
+    -e DB_PASSWORD=supersecretpassword \
+    -e DB_USER=postgres \
+    star ./scripts/run.sh
+```
+
+### Running without Docker
+
+If you prefer not to use Docker, you will need:
 
 * Python3 >= 3.10
 * GDAL
 * R (required for validation)
@@ -38,42 +78,20 @@ You will also need to install the R stats packages required for the validation s
 $ R -e "install.packages(c('lme4', 'lmerTest'), repos='https://cran.rstudio.com/')"
 ```
 
-## Additional inputs
-
-There are some additional inputs required to run the pipeline, which should be plated in the directory you use to store the pipeline results.
-
-* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
-* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
-* BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
-
-## Running the pipeline
-
-The easiest way to get started will be to run `scripts/run.sh` under a linux environment.
-
-### Running on Ubuntu
-
-The following extra utilities will need to be installed:
-
-* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a utility for downloading data from various primary sources.
-* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a utility to run jobs in parallel driven by a CSV file.
+Before running the pipeline you will need to set several environment variables to tell the script where to store data and where to find the database with the IUCN Redlist. You can set these manually, or we recommend using a tool like [direnv](https://direnv.net).
 
-### Running in Docker
-
-There is included a docker file, which is based on the GDAL container image, which is set up to install everything ready to use. You can build that using:
-
-```
-$ docker buildx build -t star .
+```shell
+export DATADIR=[PATH WHERE YOU WANT THE RESULTS]
+export DB_HOST=localhost
+export DB_NAME=iucnredlist
+export DB_PASSWORD=supersecretpassword
+export DB_USER=postgres
 ```
 
-You can then invoke the run script using this. You should map an external folder into the container as a place to store the intermediary data and final results, and you should provide details about the Postgres instance with the IUCN redlist:
 
-```
-$ docker run --rm -v /some/local/dir:/data \
-    -e DB_HOST=localhost \
-    -e DB_NAME=iucnredlist \
-    -e DB_PASSWORD=supersecretpassword \
-    -e DB_USER=postgres \
-    star ./scripts/run.sh
+Once you have all that you can then run the pipeline:
 
-```
+```shell
+(venv) $ ./scripts/run.sh
 ```
 
 # Credits
diff --git a/requirements.txt b/requirements.txt
index 6fad873..720c71a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,13 +6,12 @@ psutil
 pymer4
 pyproj
 scikit-image
-yirgacheffe
-aoh[validation]
+yirgacheffe>=1.9
+aoh[validation]>=1.0
+
+# GDAL should be installed manually to match the version of the library installed on your machine
 gdal[numpy]
 
-git+https://github.com/quantifyearth/iucn_modlib
-
 pylint
 mypy
diff --git a/scripts/run.sh b/scripts/run.sh
index a1d153f..f3c193f 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -32,7 +32,7 @@ if [[ -n "${SLURM_JOB_ID}" ]]; then
     cd "${HOME}"/dev/star
     PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}"
 else
-    PROCESS_COUNT=$(nproc --all)
+    PROCESS_COUNT=$(getconf _NPROCESSORS_ONLN)
 fi
 echo "Using ${PROCESS_COUNT} threads."
 
@@ -79,7 +79,6 @@ fi
 if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then
     if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then
         echo "Fetching elevation map..."
-        mkdir -p "${DATADIR}"/elevation
         reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif
     fi
     if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then
From 8346a594a5f260264fd856285362654f9271f467 Mon Sep 17 00:00:00 2001
From: Michael Dales
Date: Thu, 16 Oct 2025 09:14:28 +0100
Subject: [PATCH 21/21] Add Lumbierres crosswalk

---
 README.md                | 15 ++++++++++++++-
 data/crosswalk_bin_T.csv | 18 ++++++++++++++++++
 scripts/run.sh           |  2 +-
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 data/crosswalk_bin_T.csv

diff --git a/README.md b/README.md
index 1749c29..bc552b6 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@
 There are some additional inputs required to run the pipeline, which should be placed in the directory you use to store the pipeline results.
 
-* crosswalk_bin_T.csv - the crosswalk table from the [Lumbierres et al 2021](https://conbio.onlinelibrary.wiley.com/doi/10.1111/cobi.13851)
 * SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist.
 * BL_Species_Elevations_2023.csv (optional) - corrections to the elevation of birdlife species on the IUCN Redlist taken from the BirdLife data.
@@ -97,3 +96,17 @@ Once you have all that you can then run the pipeline:
 # Credits
 
 The author of this package is greatly indebted to both [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) from the University of Newcastle and [Simon Tarr](https://www.linkedin.com/in/simon-tarr-22069b209/) of the IUCN for their guidance and review.
+
+## Data Attribution
+
+The crosswalk table `data/crosswalk_bin_T.csv` was created by [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) and is derived from:
+
+```
+Lumbierres, M., Dahal, P.R., Di Marco, M., Butchart, S.H.M., Donald, P.F.,
+& Rondinini, C. (2022). Translating habitat class to land cover to map area
+of habitat of terrestrial vertebrates. Conservation Biology, 36, e13851.
+https://doi.org/10.1111/cobi.13851
+```
+
+The paper is licensed under CC BY-NC. It is used in this STAR implementation to crosswalk between the IUCN Habitat classes in the Redlist and the land classes in the Copernicus data layers.
+
diff --git a/data/crosswalk_bin_T.csv b/data/crosswalk_bin_T.csv
new file mode 100644
index 0000000..f068cf8
--- /dev/null
+++ b/data/crosswalk_bin_T.csv
@@ -0,0 +1,18 @@
+CGLS100_name,CGLS100_value,Label,H_1,H_2,H_3,H_4,H_5,H_6,H_7,H_8,H_14.1,H_14.2,H_14.3,H_14.6,H_14.4,H_14.5,H_15
+CLS_20_shrubs,20,shrubs,0,1,1,0,0,0,U,1,0,0,0,0,0,0,0
+CLS_30_Herbaceous_vegetation,30,Herbaceous_vegetation,0,0,0,1,0,0,U,0,0,0,0,0,0,0,0
+CLS_40_CultivatedandManaged_VegetationAgriculture,40,CultivatedandManaged_VegetationAgriculture,0,0,0,1,1,0,U,0,1,1,0,0,0,0,0
+CLS_50_Urban_builtup,50,Urban_builtup,0,0,0,0,0,0,U,0,0,0,0,0,1,1,0
+CLS_60_bare_sparsevegetation,60,bare_sparsevegetation,0,0,1,0,0,1,U,1,0,0,0,0,0,0,0
+CLS_80_permanent_water,80,permanent_water,0,0,0,0,1,0,U,0,0,0,0,0,0,0,0
+CLS_90_Herbaceous_wetland,90,Herbaceous_wetland,0,0,0,0,1,0,U,0,0,0,0,0,0,0,1
+CLS_111_Closedforest_evergreen_needle,111,Closedforest_evergreen_needle,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_112_Closedforest_evergreen_broad,112,Closedforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_114_Closedforest_deciduous_broad,114,Closedforest_deciduous_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_115_Closedforest_mixed,115,Closedforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_116_Closedforest_unknown,116,Closedforest_unknown,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_121_Openforest_evergreen_needle,121,Openforest_evergreen_needle,1,0,0,0,0,1,U,0,0,0,0,0,0,0,0
+CLS_122_Openforest_evergreen_broad,122,Openforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_124_Openforest_deciduous_broad,124,Openforest_deciduous_broad,0,1,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_125_Openforest_mixed,125,Openforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0
+CLS_126_Openforest_unknown,126,Openforest_unknown,0,0,0,0,0,0,U,0,0,0,0,0,0,0,0
diff --git a/scripts/run.sh b/scripts/run.sh
index f3c193f..d88e05b 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -94,7 +94,7 @@ fi
 # Generate the crosswalk table
 if [ ! -f "${DATADIR}"/crosswalk.csv ]; then
     echo "Generating crosswalk table..."
-    python3 ./prepare_layers/convert_crosswalk.py --original "${DATADIR}"/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv
+    python3 ./prepare_layers/convert_crosswalk.py --original ./data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv
 fi
 
 # Get species data per taxa from IUCN data