From 53481e8ad34b81439d3e65994f0993836175938e Mon Sep 17 00:00:00 2001 From: Mats Bovin Date: Fri, 5 Jan 2024 11:08:11 +0100 Subject: [PATCH] Update comments and readme --- sbdi/README.md | 22 ++++++++++++++++------ sbdi/process-backbone.py | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/sbdi/README.md b/sbdi/README.md index e3177be..a2890b6 100644 --- a/sbdi/README.md +++ b/sbdi/README.md @@ -37,9 +37,19 @@ Tag 1.0.2 created and pushed. ## Build search index -The solr search index can be built from scratch in the following way from the /admin page: - -* Select **DwCA Import** and import the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c) (~5h) -* Select **Create Links** and run: - * **Build search and suggest weights** (~7h) - * **Build solr suggestion index** (~15min, application will throw a read timeout exception but indexing will continue to run on Solr) +The solr search index can be built from scratch from the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c). Before importing the data it needs to be pre-processed (see [process-backbone.py](./process-backbone.py) for details). 
+ +* Download backbone (to `/data/bie-index/import`): + ``` + make fetch-backbone + ``` +* Pre-process backbone: + ``` + make process-backbone + ``` +* Go to the /admin page and select **DwCA Import** and import from `/data/bie-index/import/backbone` (~2h) +* Go to the /admin page and select **Create Links** and run: + * **Denormalise taxa** (~8h) + * ~~**Build link identifiers** (7:30h)~~ + * **Build search and suggest weights** (~2:15h) + * **Build solr suggestion index** (~15min - the application will throw a read timeout exception but indexing will continue to run on Solr) diff --git a/sbdi/process-backbone.py b/sbdi/process-backbone.py index adc1cbc..12ef144 100755 --- a/sbdi/process-backbone.py +++ b/sbdi/process-backbone.py @@ -1,11 +1,35 @@ #!/usr/bin/env python3 +# +# This script pre-processes the GBIF Backbone taxonomy before loading it into the bie-index. +# +# The original files are renamed (eg. Taxon.tsv -> Taxon.tsv.original) and the processed file +# is saved with the original name (eg. Taxon.tsv). +# +# The following processing is done: +# +# Taxon +# ----- +# - Remove scientificNameAuthorship from scientificName (if included) because the bie-index +# expects the scientificName to be without authorship. 
+# Eg: Capreolus capreolus (Linnaeus, 1758) -> Capreolus capreolus +# +# VernacularName +# -------------- +# - Only include Swedish and English names +# - Exclude names from some sources of bad quality +# + import os import sys -ALLOWED_LANGUGES = ['sv', 'en'] +ALLOWED_LANGUAGES = [ + 'sv', + 'en', +] DISALLOWED_SOURCES = [ - 'Belgian Species List', # This contains comma-seprated lists of names - 'Abrocomidae', # This and all the following have all names wrongly tagged as English + 'Belgian Species List', # Contains comma-separated lists of names + # All of these have names in various languages wrongly tagged as English + 'Abrocomidae', 'Acrobatidae', 'Ailuridae', 'Alpheidae', @@ -167,6 +191,7 @@ def process_taxon(src_dir): destination_path = f'{src_dir}/Taxon.tsv' original_path = f'{destination_path}.original' + # Rename original file (if not already done) if not os.path.isfile(original_path): os.rename(destination_path, original_path) @@ -181,6 +206,7 @@ scientificName = record[5] scientificNameAuthorship = record[6] + # Remove scientificNameAuthorship from scientificName if scientificNameAuthorship and scientificName.endswith(scientificNameAuthorship): record[5] = scientificName[:-len(scientificNameAuthorship)].strip() @@ -202,6 +228,7 @@ def process_vernacular_name(src_dir): destination_path = f'{src_dir}/VernacularName.tsv' original_path = f'{destination_path}.original' + # Rename original file (if not already done) if not os.path.isfile(original_path): os.rename(destination_path, original_path) @@ -217,8 +244,8 @@ language = record[2] source = record[7] - if (row_count == 0 or - (language in ALLOWED_LANGUGES and + if (row_count == 0 or # Header row + (language in ALLOWED_LANGUAGES and source not in DISALLOWED_SOURCES)): keep_count = keep_count + 1 outfile.write(row)