Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 72 additions & 6 deletions prepare_species/extract_species_data_psql.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"id_no",
"assessment_id",
"season",
"systems",
"elevation_lower",
"elevation_upper",
"full_habitat_code",
Expand Down Expand Up @@ -60,13 +61,27 @@
LEFT JOIN red_list_category_lookup ON red_list_category_lookup.id = assessments.red_list_category_id
WHERE
assessments.latest = true
AND assessments.sis_taxon_id NOT IN %s
AND assessment_scopes.scope_lookup_id = 15 -- global assessments only
AND taxons.class_name = %s
AND taxons.infra_type is NULL -- no subspecies
AND taxons.metadata->>'taxon_level' = 'Species'
AND red_list_category_lookup.code IN ('NT', 'VU', 'EN', 'CR')
"""

# Query for the systems recorded against a single assessment. The one bind
# parameter is the assessment ID. STRING_AGG joins the English system
# descriptions with '|' into a single `systems` column. Because both joins are
# LEFT JOINs, an existing assessment with no linked systems still yields one
# row, whose `systems` value is NULL.
SYSTEMS_STATEMENT = """
SELECT
STRING_AGG(system_lookup.description->>'en', '|') AS systems
FROM
assessments
LEFT JOIN assessment_systems ON assessment_systems.assessment_id = assessments.id
LEFT JOIN system_lookup ON assessment_systems.system_lookup_id = system_lookup.id
WHERE
assessments.id = %s
GROUP BY
assessments.id
"""

THREATS_STATEMENT = """
SELECT
supplementary_fields->>'scope' AS scope,
Expand Down Expand Up @@ -117,6 +132,8 @@ class SpeciesReport:
"assessment_id",
"scientific_name",
"possibly_extinct",
"has_systems",
"not_terrestrial_system",
"has_threats",
"has_habitats",
"keeps_habitats",
Expand Down Expand Up @@ -173,6 +190,25 @@ def tidy_reproject_save(
res_projected.to_file(output_path, driver="GeoJSON")
report.filename = output_path

def process_systems(
    systems_data: List[Tuple],
    report: SpeciesReport,
) -> str:
    """Validate the systems query result for one assessment and return it.

    Args:
        systems_data: Rows fetched for the systems query. Exactly one row is
            expected, whose first column is the '|'-joined systems string
            (or None when the assessment has no systems recorded).
        report: Per-species report; progress flags are set on it as each
            check passes, so a partially filled report shows where a
            species was dropped.

    Returns:
        The '|'-joined systems string, which is known to contain
        "Terrestrial".

    Raises:
        ValueError: If there is no row, more than one row, the systems value
            is None, or "Terrestrial" does not appear in the systems string.
    """
    if len(systems_data) == 0:
        raise ValueError("No systems found")
    if len(systems_data) > 1:
        raise ValueError("More than one systems aggregation found")

    systems = systems_data[0][0]
    if systems is None:
        raise ValueError("no systems info")
    report.has_systems = True

    # Substring test against the whole '|'-joined string, not a per-item match.
    if "Terrestrial" not in systems:
        raise ValueError("No Terrestrial in systems")
    # NOTE(review): despite its name, this flag is set to True when the
    # Terrestrial check *passes* - presumably it records that the species got
    # past this filter, like the other report flags. Confirm with the report
    # consumer before renaming.
    report.not_terrestrial_system = True

    return systems

SCOPES = [
"whole (>90%)",
"majority (50-90%)",
Expand Down Expand Up @@ -214,12 +250,14 @@ def process_threats(
return total != 0

def process_habitats(
habitats_data: List,
habitats_data: List[List[str]],
report: SpeciesReport,
) -> Set:
if len(habitats_data) == 0:
raise ValueError("No habitats found")
report.has_habitats = True
# Promote to "Unknown"
habitats_data = [["18"]]
else:
report.has_habitats = True
if len(habitats_data) > 1:
raise ValueError("Expected only one habitat row")

Expand All @@ -229,7 +267,7 @@ def process_habitats(
habitat_values = habitat_values_row[0]

if habitat_values is None:
continue
habitat_values = "18"
habitat_set = {x for x in habitat_values.split('|') if x}
habitats |= habitat_set

Expand Down Expand Up @@ -291,6 +329,15 @@ def process_row(
presence += (4,)
report.possibly_extinct = True # pylint: disable=W0201


cursor.execute(SYSTEMS_STATEMENT, (assessment_id,))
systems_data = cursor.fetchall()
try:
systems = process_systems(systems_data, report)
except ValueError as exc:
logger.debug("Dropping %s: %s", id_no, str(exc))
return report

cursor.execute(THREATS_STATEMENT, (assessment_id,))
raw_threats = cursor.fetchall()
threatened = process_threats(raw_threats, report)
Expand All @@ -316,6 +363,7 @@ def process_row(
id_no,
assessment_id,
"all",
systems,
int(elevation_lower) if elevation_lower is not None else None,
int(elevation_upper) if elevation_upper is not None else None,
'|'.join(list(habitats)),
Expand Down Expand Up @@ -359,20 +407,30 @@ def apply_overrides(

def extract_data_per_species(
class_name: str,
overrides_path: str,
overrides_path: Optional[str],
excludes_path: Optional[str],
output_directory_path: str,
target_projection: Optional[str],
) -> None:

connection = psycopg2.connect(DB_CONFIG)
cursor = connection.cursor()

excludes = []
if excludes_path is not None:
try:
df = pd.read_csv(excludes_path)
excludes = tuple([int(x) for x in df.id_no.unique()]) # pylint: disable=R1728
logger.info("Excluding %d species", len(excludes))
except FileNotFoundError:
pass

# For STAR-R we need historic data, but for STAR-T we just need current.
# for era, presence in [("current", (1, 2)), ("historic", (1, 2, 4, 5))]:
for era, presence in [("current", (1, 2))]:
era_output_directory_path = os.path.join(output_directory_path, era)

cursor.execute(MAIN_STATEMENT, (class_name,))
cursor.execute(MAIN_STATEMENT, (excludes, class_name,))
# This can be quite big (tens of thousands), but in modern computing terms that is
# quite small, and I need to make a follow-on DB query per result.
results = cursor.fetchall()
Expand Down Expand Up @@ -415,6 +473,13 @@ def main() -> None:
required=False,
dest="overrides",
)
parser.add_argument(
'--excludes',
type=str,
help="CSV of taxon IDs to not include",
required=False,
dest="excludes"
)
parser.add_argument(
'--output',
type=str,
Expand All @@ -435,6 +500,7 @@ def main() -> None:
extract_data_per_species(
args.classname,
args.overrides,
args.excludes,
args.output_directory_path,
args.target_projection
)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_species_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def test_simple_example():
def test_no_habitats_in_db():
habitat_data = []
report = SpeciesReport(1, 2, "name")
with pytest.raises(ValueError):
_ = process_habitats(habitat_data, report)
res = process_habitats(habitat_data, report)
assert res == set(["18"])
assert not report.has_habitats
assert not report.keeps_habitats
assert report.keeps_habitats

def test_too_many_habitats_in_db():
habitat_data = [("4.1|4.2",), ("1.2",)]
Expand Down