From 53481e8ad34b81439d3e65994f0993836175938e Mon Sep 17 00:00:00 2001 From: Mats Bovin Date: Fri, 5 Jan 2024 11:08:11 +0100 Subject: [PATCH] Update comments and readme --- sbdi/README.md | 22 ++++++++++++++++------ sbdi/process-backbone.py | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/sbdi/README.md b/sbdi/README.md index e3177be..a2890b6 100644 --- a/sbdi/README.md +++ b/sbdi/README.md @@ -37,9 +37,19 @@ Tag 1.0.2 created and pushed. ## Build search index -The solr search index can be built from scratch in the following way from the /admin page: - -* Select **DwCA Import** and import the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c) (~5h) -* Select **Create Links** and run: - * **Build search and suggest weights** (~7h) - * **Build solr suggestion index** (~15min, application will throw a read timeout exception but indexing will continue to run on Solr) +The solr search index can be built from scratch from the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c). Before importing the data it needs to be pre-processed (see [process-backbone.py](./process-backbone.py) for details). 
+ +* Download backbone (to `/data/bie-index/import`): + ``` + make fetch-backbone + ``` +* Pre-process backbone: + ``` + make process-backbone + ``` +* Go to the /admin page and select **DwCA Import** and import from `/data/bie-index/import/backbone` (~2h) +* Go to the /admin page and select **Create Links** and run: + * **Denormalise taxa** (~8h) + * ~~**Build link identifiers** (7:30h)~~ + * **Build search and suggest weights** (~2:15h) + * **Build solr suggestion index** (~15min - the application will throw a read timeout exception but indexing will continue to run on Solr) diff --git a/sbdi/process-backbone.py b/sbdi/process-backbone.py index adc1cbc..12ef144 100755 --- a/sbdi/process-backbone.py +++ b/sbdi/process-backbone.py @@ -1,11 +1,35 @@ #!/usr/bin/env python3 +# +# This script pre-processes the GBIF Backbone taxonomy before loading it into the bie-index. +# +# The original files are renamed (eg. Taxon.tsv -> Taxon.tsv.original) and the processed file +# is saved with the original name (eg. Taxon.tsv). +# +# The following processing is done: +# +# Taxon +# ----- +# - Remove scientificNameAuthorship from scientificName (if included) because the bie-index +# expects the scientificName to be without authorship. 
+# Eg: Capreolus capreolus (Linnaeus, 1758) -> Capreolus capreolus +# +# VernacularName +# -------------- +# - Only include Swedish and English names +# - Exclude names from some sources of bad quality +# + import os import sys -ALLOWED_LANGUGES = ['sv', 'en'] +ALLOWED_LANGUAGES = [ + 'sv', + 'en', +] DISALLOWED_SOURCES = [ - 'Belgian Species List', # This contains comma-seprated lists of names - 'Abrocomidae', # This and all the following have all names wrongly tagged as English + 'Belgian Species List', # Contains comma-separated lists of names + # All of these have names in various languages wrongly tagged as English + 'Abrocomidae', 'Acrobatidae', 'Ailuridae', 'Alpheidae', @@ -167,6 +191,7 @@ def process_taxon(src_dir): destination_path = f'{src_dir}/Taxon.tsv' original_path = f'{destination_path}.original' + # Rename original file (if not already done) if not os.path.isfile(original_path): os.rename(destination_path, original_path) @@ -181,6 +206,7 @@ scientificName = record[5] scientificNameAuthorship = record[6] + # Remove scientificNameAuthorship from scientificName if scientificNameAuthorship and scientificName.endswith(scientificNameAuthorship): record[5] = scientificName[:-len(scientificNameAuthorship)].strip() @@ -202,6 +228,7 @@ def process_vernacular_name(src_dir): destination_path = f'{src_dir}/VernacularName.tsv' original_path = f'{destination_path}.original' + # Rename original file (if not already done) if not os.path.isfile(original_path): os.rename(destination_path, original_path) @@ -217,8 +244,8 @@ language = record[2] source = record[7] - if (row_count == 0 or - (language in ALLOWED_LANGUGES and + if (row_count == 0 or # Header row + (language in ALLOWED_LANGUAGES and source not in DISALLOWED_SOURCES)): keep_count = keep_count + 1 outfile.write(row)