4 changes: 4 additions & 0 deletions prepare_layers/convert_crosswalk.py
@@ -50,6 +50,10 @@ def convert_crosswalk(
        for code in codes:
            res.append([hab, code])

    # This addition is for the islands correction layer, which will
    # be added as an effective lcc_0.tif
    res.append(["islands", "0"])

    df = pd.DataFrame(res, columns=["code", "value"])
    df.to_csv(output_path, index=False)

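The new crosswalk row makes "islands" resolve to habitat value 0. Below is a rough sketch of how a downstream lookup would then pick up the correction layer, assuming per-value rasters named lcc_<value>.tif (as the "effective lcc_0.tif" comment implies); habitat_rasters is a hypothetical helper for illustration, not pipeline code:

# Hypothetical illustration, not pipeline code: resolve a species' habitat
# codes to raster paths via the crosswalk CSV written above.
import pandas as pd
from pathlib import Path

def habitat_rasters(habitats, crosswalk_csv: Path, layer_dir: Path) -> list[Path]:
    crosswalk = pd.read_csv(crosswalk_csv, dtype=str)
    values = crosswalk.loc[crosswalk["code"].isin(habitats), "value"]
    return [layer_dir / f"lcc_{v}.tif" for v in values.unique()]

# With the new row, "islands" resolves to lcc_0.tif, the missing-islands layer.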
40 changes: 40 additions & 0 deletions prepare_layers/remove_nans_from_mask.py
@@ -0,0 +1,40 @@
import argparse
import os
from pathlib import Path

import yirgacheffe as yg

def remove_nans_from_mask(
    input_path: Path,
    output_path: Path,
) -> None:
    os.makedirs(output_path.parent, exist_ok=True)
    with yg.read_raster(input_path) as layer:
        converted = layer.nan_to_num()
        converted.to_geotiff(output_path)

def main() -> None:
    parser = argparse.ArgumentParser(description="Convert NaNs to zeros in mask layers")
    parser.add_argument(
        '--original',
        type=Path,
        help="Original raster",
        required=True,
        dest="original_path",
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Destination raster',
        required=True,
        dest='output_path',
    )
    args = parser.parse_args()

    remove_nans_from_mask(
        args.original_path,
        args.output_path
    )

if __name__ == "__main__":
    main()
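A minimal usage sketch of the new script, called as a function rather than via the CLI; the paths mirror the ones scripts/run.sh uses below, with a hypothetical data/ root standing in for DATADIR, and assume the repo root is importable:

# Sketch: same operation the --original/--output CLI performs.
from pathlib import Path
from prepare_layers.remove_nans_from_mask import remove_nans_from_mask

remove_nans_from_mask(
    Path("data/Zenodo/CGLS100Inland_withGADMIslands.tif"),  # mask containing NaNs
    Path("data/masks/CGLS100Inland_withGADMIslands.tif"),   # NaN-free copy
)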
15 changes: 14 additions & 1 deletion prepare_species/common.py
@@ -28,7 +28,8 @@
"threats",
"category",
"category_weight",
"geometry"
"geometry",
"in_star",
]

# From Muir et al: For each species, a global STAR threat abatement (START) score
@@ -84,6 +85,8 @@ class SpeciesReport:
"keeps_habitats",
"has_geometries",
"keeps_geometries",
"has_category",
"in_star",
"filename",
]

@@ -224,6 +227,16 @@ def tidy_reproject_save(
    os.makedirs(output_directory_path, exist_ok=True)
    output_path = output_directory_path / f"{grow.id_no}.geojson"
    res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry")

    # Ensure proper dtypes for JSON serialization
    int_columns = ['id_no', 'assessment_id', 'assessment_year', 'elevation_lower', 'elevation_upper', 'category_weight']
    for col in int_columns:
        if col in res.columns:
            res[col] = res[col].astype('Int64') # Nullable integer type

    if 'in_star' in res.columns:
        res['in_star'] = res['in_star'].astype(bool)

    res_projected = res.to_crs(target_crs)
    res_projected.to_file(output_path, driver="GeoJSON")
    report.filename = output_path
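The Int64 cast matters because pandas promotes an integer column containing None to float64, so a value like an elevation_lower of 100 would serialize as 100.0 in the GeoJSON. A standalone illustration of that behaviour:

# Standalone illustration of the dtype problem the cast above avoids.
import pandas as pd

s = pd.Series([100, None])
print(s.dtype)                  # float64: the 100 has become 100.0
print(s.astype("Int64").dtype)  # Int64: integer kept, None becomes <NA>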
60 changes: 43 additions & 17 deletions prepare_species/extract_species_data_psql.py
@@ -30,6 +30,10 @@
logging.basicConfig()
logger.setLevel(logging.DEBUG)

# Note that this query returns more species than would be accepted into STAR based on
# threat category, but for model checking we want all AOHs for a given taxon, so the
# pipeline must process them all, and then just include the correct set when processing
# STAR
MAIN_STATEMENT = """
SELECT
    assessments.sis_taxon_id as id_no,
@@ -54,7 +58,7 @@
    AND taxons.class_name = %s
    AND taxons.infra_type is NULL -- no subspecies
    AND taxons.metadata->>'taxon_level' = 'Species'
    AND red_list_category_lookup.code IN ('NT', 'VU', 'EN', 'CR')
    AND red_list_category_lookup.code IN ('DD', 'LC', 'NT', 'VU', 'EN', 'CR')
"""

SYSTEMS_STATEMENT = """
@@ -138,20 +142,10 @@ def process_row(
        presence += (4,)
        report.possibly_extinct = True # pylint: disable=W0201

    cursor.execute(SYSTEMS_STATEMENT, (assessment_id,))
    systems_data = cursor.fetchall()
    try:
        systems = process_systems(systems_data, report)
    except ValueError as exc:
        logger.debug("Dropping %s: %s", id_no, str(exc))
        return report

    cursor.execute(THREATS_STATEMENT, (assessment_id,))
    raw_threats = cursor.fetchall()
    threats = process_threats(raw_threats, report)
    if len(threats) == 0:
        return report
    include_in_star = category in ('NT', 'VU', 'EN', 'CR')
    report.has_category = include_in_star

    # First checks are the ones that rule out being able to make an AOH at all
    cursor.execute(HABITATS_STATEMENT, (assessment_id,))
    raw_habitats = cursor.fetchall()
    try:
@@ -170,6 +164,37 @@
    except ValueError as _exc:
        return report

    # Second checks are whether it is good for STAR, so from hereon we should
    # output a GeoJSON regardless as we should make an AOH for validation with this
    cursor.execute(SYSTEMS_STATEMENT, (assessment_id,))
    systems_data = cursor.fetchall()
    try:
        systems = process_systems(systems_data, report)
    except ValueError as exc:
        logger.debug("Dropping %s: %s", id_no, str(exc))
        include_in_star = False
        try:
            systems = systems_data[0][0]
        except IndexError:
            systems = []

    cursor.execute(THREATS_STATEMENT, (assessment_id,))
    raw_threats = cursor.fetchall()
    threats = process_threats(raw_threats, report)
    if len(threats) == 0:
        include_in_star = False

    report.in_star = include_in_star

    try:
        category_weight = CATEGORY_WEIGHTS[category]
    except KeyError:
        assert include_in_star is False
        category_weight = 0

    # This is a fix as per the method to include the missing islands layer:
    habitats_list = list(habitats) + ["islands"]

    gdf = gpd.GeoDataFrame(
        [[
            id_no,
@@ -179,14 +204,15 @@
            systems,
            int(elevation_lower) if elevation_lower is not None else None,
            int(elevation_upper) if elevation_upper is not None else None,
            '|'.join(list(habitats)),
            '|'.join(habitats_list),
            scientific_name,
            family_name,
            class_name,
            json.dumps(threats),
            category,
            CATEGORY_WEIGHTS[category],
            geometry
            category_weight,
            geometry,
            include_in_star,
        ]],
        columns=COLUMNS,
        crs=target_projection or 'epsg:4326'
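Condensed, the reworked flow makes STAR membership a conjunction of three checks while still producing an AOH either way. A sketch of the logic only, not the actual pipeline code:

# Sketch: in_star needs an eligible Red List category, valid systems data,
# and at least one recognised threat; failing a check no longer drops the
# species, it only excludes it from STAR.
STAR_CATEGORIES = {"NT", "VU", "EN", "CR"}

def include_in_star(category: str, systems_ok: bool, n_threats: int) -> bool:
    return category in STAR_CATEGORIES and systems_ok and n_threats > 0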
4 changes: 3 additions & 1 deletion prepare_species/extract_species_data_redlist.py
@@ -243,6 +243,8 @@ def process_species(
        logger.debug("Dropping %s: %s", id_no, exc)
        return report

    habitats_list = list(habitats) + ["islands"]

    # Create GeoDataFrame with all data
    gdf = gpd.GeoDataFrame(
        [[
@@ -253,7 +255,7 @@
            systems,
            elevation_lower,
            elevation_upper,
            '|'.join(list(habitats)),
            '|'.join(habitats_list),
            scientific_name,
            family_name,
            class_name,
4 changes: 2 additions & 2 deletions requirements.txt
@@ -7,8 +7,8 @@ pymer4
pyproj
scikit-image
redlistapi
yirgacheffe>=1.9
aoh[validation]>=1.0
yirgacheffe>=1.12,<2.0
aoh[validation]>=2.0.1,<3.0

# GDAL should be installed manually to match the version of the library installed on your machine
gdal[numpy]
43 changes: 23 additions & 20 deletions scripts/run.sh
@@ -63,32 +63,20 @@ if [ ! -d "${DATADIR}"/habitat_layers ]; then

echo "Processing habitat map..."
aoh-habitat-process --habitat "${DATADIR}"/habitat/raw.tif \
--scale 1000.0 \
--scale 992.292720200000133 \
--projection "ESRI:54009" \
--output "${DATADIR}"/tmp_habitat_layers/current
mv "${DATADIR}"/tmp_habitat_layers "${DATADIR}"/habitat_layers
fi

if [ ! -d "${DATADIR}"/masks ]; then
    echo "Processing masks..."
    python3 ./prepare_layers/make_masks.py --habitat_layers "${DATADIR}"/habitat_layers/current \
        --output_directory "${DATADIR}"/masks
if [ ! -f "${DATADIR}"/habitat_layers/current/lcc_0.tif ]; then
    cp "${DATADIR}/Zenodo/MissingLandcover_1km_cover.tif" "${DATADIR}"/habitat_layers/current/lcc_0.tif
fi

# Fetch and prepare the elevation layers
if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then
    if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then
        echo "Fetching elevation map..."
        reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif
    fi
    if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then
        echo "Generating elevation max layer..."
        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-max-1k.tif
    fi
    if [ ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]; then
        echo "Generating elevation min layer..."
        gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-min-1k.tif
    fi
if [ ! -f "${DATADIR}"/masks/CGLS100Inland_withGADMIslands.tif ]; then
    echo "Processing masks..."
    python3 ./prepare_layers/remove_nans_from_mask.py --original "${DATADIR}"/Zenodo/CGLS100Inland_withGADMIslands.tif \
        --output "${DATADIR}"/masks/CGLS100Inland_withGADMIslands.tif
fi

# Generate the crosswalk table
@@ -102,7 +90,10 @@ for TAXA in "${TAXALIST[@]}"
do
    if [ ! -d "${DATADIR}"/species-info/"${TAXA}"/ ]; then
        echo "Extracting species data for ${TAXA}..."
        python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" --output "${DATADIR}"/species-info/"${TAXA}"/ --projection "ESRI:54009" --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv
        python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" \
            --output "${DATADIR}"/species-info/"${TAXA}"/ \
            --projection "ESRI:54009" \
            --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv
    fi
done

@@ -133,6 +124,18 @@ aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ \
echo "Calculating model validation..."
aoh-validate-prevalence --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
--output "${DATADIR}"/validation/model_validation.csv
for TAXA in "${TAXALIST[@]}"
do
echo "Fetching GBIF data for ${TAXA}..."
aoh-fetch-gbif-data --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
--taxa "${TAXA}" \
--output_dir "${DATADIR}"/validation/occurrences/
echo "Validating occurrences for ${TAXA}..."
aoh-validate-occurrences --gbif_data_path "${DATADIR}"/validation/occurrences/"${TAXA}" \
--species_data "${DATADIR}"/species-info/"${TAXA}"/current/ \
--aoh_results "${DATADIR}"/aohs/current/"${TAXA}"/ \
--output "${DATADIR}"/validation/occurrences/"${TAXA}".csv
done

# Threats
echo "Generating threat task list..."
10 changes: 5 additions & 5 deletions utils/aoh_generator.py
@@ -22,21 +22,21 @@ def aoh_generator(
    for species_path in species_paths:
        res.append([
            data_dir / "habitat_layers" / scenario,
            data_dir / "elevation" / "elevation-max-1k.tif",
            data_dir / "elevation" / "elevation-min-1k.tif",
            data_dir / "Zenodo" / "FABDEM_1km_max_patched.tif",
            data_dir / "Zenodo" / "FABDEM_1km_min_patched.tif",
            data_dir / "crosswalk.csv",
            species_path,
            data_dir / "masks" / "terrestrial_mask.tif",
            data_dir / "masks" / "CGLS100Inland_withGADMIslands.tif",
            data_dir / "aohs" / scenario / taxa_dir_path.name,
        ])

    df = pd.DataFrame(res, columns=[
        '--habitats',
        '--fractional_habitats',
        '--elevation-max',
        '--elevation-min',
        '--crosswalk',
        '--speciesdata',
        '--area',
        '--weights',
        '--output'
    ])
    output_dir, _ = os.path.split(output_csv_path)
17 changes: 12 additions & 5 deletions utils/threats_generator.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

import argparse
import json
import os
from pathlib import Path

@@ -23,11 +24,17 @@ def threats_generator(
        taxon_id = species_path.stem
        aoh_path = data_dir / "aohs" / source / taxa_dir_path.name / f"{taxon_id}_all.tif"
        if aoh_path.exists():
            res.append([
                species_path,
                aoh_path,
                data_dir / "threat_rasters" / taxa_dir_path.name,
            ])
            # Check if species should be included in STAR. We don't use geopandas here
            # because it will parse the geometry, which can be quite slow, and we just
            # want a single bool field
            with open(species_path, 'r', encoding="UTF-8") as f:
                data = json.load(f)
            if data['features'][0]['properties']['in_star']:
                res.append([
                    species_path,
                    aoh_path,
                    data_dir / "threat_rasters" / taxa_dir_path.name,
                ])

    df = pd.DataFrame(res, columns=[
        '--speciesdata',
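The json.load shortcut assumes each per-species GeoJSON is a FeatureCollection with a single feature carrying the in_star property, which is what tidy_reproject_save writes. A minimal self-check sketch, with a hypothetical file path:

# Sketch: assert a species file has the shape the fast check relies on.
import json
from pathlib import Path

path = Path("data/species-info/AVES/current/12345.geojson")  # hypothetical
with open(path, "r", encoding="UTF-8") as f:
    data = json.load(f)
assert data["type"] == "FeatureCollection"
assert len(data["features"]) == 1
assert isinstance(data["features"][0]["properties"]["in_star"], bool)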