From 9a64b6eaa7b47e43c19bbe2ab7ac23ffa9389e72 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:33:56 +0100 Subject: [PATCH 01/92] MNT: Show which ds has None for size --- pangaea_downloader/pq_scraper.py | 5 ++++- pangaea_downloader/tools/datasets.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index bb17b7f..dfbf485 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -95,7 +95,10 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): continue df = pd.concat(df_list) else: - dataset_type = process.ds_type(size) + try: + dataset_type = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {ds_id}") if dataset_type == "video": if verbose >= 1: print( diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index b54c6d5..1965371 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -80,7 +80,10 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: url = process.url_from_uri(child_uri) size = process.get_html_info(url) # Assess type - typ = process.ds_type(size) + try: + typ = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {url}") if typ == "video": if verbose >= 1: print( From 0460e6021a9d8c273f9cc7427ef3796c49881269 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 9 Dec 2021 16:06:10 -0400 Subject: [PATCH 02/92] ENH: Add option to control whether URL columns are required --- pangaea_downloader/tools/datasets.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 1965371..dbc841a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,7 +19,7 @@ T_POLL_INTV = 0.1667 -def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: +def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST @@ -39,7 +39,7 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Check for image URL column - if not checker.has_url_col(ds.data): + if ensure_url and not checker.has_url_col(ds.data): if verbose >= 1: print( colorama.Fore.YELLOW @@ -54,7 +54,9 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: return df -def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: +def fetch_children( + parent_url: str, verbose=1, ensure_url=True +) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset global T_POLL_LAST @@ -111,7 +113,7 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: + colorama.Fore.RESET ) return - if not checker.has_url_col(child.data): + if ensure_url and not checker.has_url_col(child.data): if verbose >= 1: print( colorama.Fore.YELLOW From b038c4f9cad8b098f833ec2381d6784144dd09dd Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:40:53 +0100 Subject: [PATCH 03/92] API: Disable ensure url option in search --- pangaea_downloader/pq_scraper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py 
b/pangaea_downloader/pq_scraper.py index dfbf485..7ea135a 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -75,7 +75,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, verbose=verbose - 1, ensure_url=False + ) if df_list is None: if verbose >= 1: print( @@ -110,7 +112,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): elif dataset_type == "paginated": df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": - df = datasets.fetch_child(url, verbose=verbose - 1) + df = datasets.fetch_child( + url, verbose=verbose - 1, ensure_url=False + ) except Exception as err: if isinstance(err, KeyboardInterrupt): raise From 09006e283455634771e33df063dc21779bb24078 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:53:55 +0100 Subject: [PATCH 04/92] MNT: Handle alt campaign name within set_metadata --- pangaea_downloader/tools/datasets.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index dbc841a..e9c5880 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -28,7 +28,6 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram time.sleep(t_wait) # Stay under 180 requests every 30s ds = PanDataSet(child_url) T_POLL_LAST = time.time() - doi = getattr(ds, "doi", "").split("doi.org/")[-1] # Dataset is restricted if ds.loginstatus != "unrestricted": if verbose >= 1: @@ -48,7 +47,7 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram ) return # Add metadata - df = set_metadata(ds, alt=doi) + df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) return df @@ -122,8 +121,7 @@ def fetch_children( ) else: # Add metadata - child_doi = getattr(child, "doi", "").split("doi.org/")[-1] - df = set_metadata(child, alt=child_doi) + df = set_metadata(child) # Add child dataset to list df = exclude_rows(df) df_list.append(df) @@ -136,11 +134,12 @@ def fetch_children( return None -def set_metadata(ds: PanDataSet, alt="unknown") -> DataFrame: +def set_metadata(ds: PanDataSet) -> DataFrame: """Add metadata to a PanDataSet's dataframe.""" ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign + alt = ds.data["doi"].split("doi.org/")[-1] if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = ds.events[0].campaign.name else: From ac76bc7b13d15c78b91af084b00e5655d3f9b9d7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:55:08 +0100 Subject: [PATCH 05/92] MNT: Use dataset ID as alt instead of DOI --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e9c5880..96ad334 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -139,7 +139,7 @@ def set_metadata(ds: PanDataSet) -> DataFrame: ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign - alt = ds.data["doi"].split("doi.org/")[-1] + alt = str(ds.id) if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = 
ds.events[0].campaign.name else: From f156f499dab0f4db040caabf707aa7f8821cea52 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 11:53:07 +0100 Subject: [PATCH 06/92] MNT: Skip children of parents without URL --- pangaea_downloader/pq_scraper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7ea135a..743cb70 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -75,9 +75,7 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children( - url, verbose=verbose - 1, ensure_url=False - ) + df_list = datasets.fetch_children(url, verbose=verbose - 1) if df_list is None: if verbose >= 1: print( From d8894f9845a4dcfb902824bd37dbc2039f93a2a3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 14:17:26 +0100 Subject: [PATCH 07/92] STY: Split arg per line with ensure_url included --- pangaea_downloader/tools/datasets.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 96ad334..e0adc9e 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -54,7 +54,9 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram def fetch_children( - parent_url: str, verbose=1, ensure_url=True + parent_url: str, + verbose=1, + ensure_url=True, ) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset From 13920281d0a651ff04142993fd6edbd24fadc510 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 2 Sep 2022 13:28:24 +0100 Subject: [PATCH 08/92] ENH: Get lat/lon from PanDataSet if not scraped --- pangaea_downloader/tools/scraper.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 95218f2..a6bb0bc 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -27,6 +27,30 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: soup = BeautifulSoup(resp.text, "lxml") # Get coordinates of expedition coordinates = get_metadata(soup) + if coordinates is None and hasattr(ds, "geometryextent"): + print( + colorama.Fore.RED + "\t\t\t[ALERT] Trying to get coordinates from" + " PanDataSet.geometryextent" + colorama.Fore.RESET + ) + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Longitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None or long is not None: + coordinates = lat, long + # Otherwise coordinates remains None and the error below is printed + + if coordinates is None: + print( + colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!" + " Saved file won't have Longitude, Latitude columns!"
+ colorama.Fore.RESET + ) # Get download link to photos page download_link = soup.find("div", attrs={"class": "text-block top-border"}).a["href"] @@ -71,10 +95,6 @@ def get_metadata(page_soup: BeautifulSoup) -> Optional[Tuple[float, float]]: lat = float(coordinates.find("span", attrs={"class": "latitude"}).text) long = float(coordinates.find("span", attrs={"class": "longitude"}).text) return lat, long - print( - colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!" - " Saved file won't have Longitude, Latitude columns!" + colorama.Fore.RESET - ) return None From f52e3b58497ba2143c2f115f6ef23a5a3a9551f8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 2 Sep 2022 13:29:58 +0100 Subject: [PATCH 09/92] ENH: Add auth_token support --- pangaea_downloader/pq_scraper.py | 25 ++++++++++++++++++++++--- pangaea_downloader/tools/datasets.py | 14 ++++++++++---- requirements.txt | 2 +- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 743cb70..48aa831 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -16,7 +16,12 @@ from pangaea_downloader.tools import datasets, process, scraper, search -def search_and_download(queries=None, output_dir="query-outputs", verbose=0): +def search_and_download( + queries=None, + output_dir="query-outputs", + auth_token=None, + verbose=0, +): """ Search `PANGAEA`_ for a set of queries, and download datasets for each result. @@ -31,6 +36,8 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): output_dir : str, default="query-outputs" The output directory where downloaded datasets will be saved. Any existing output datasets will be skipped instead of downloaded. + auth_token : str, optional + Bearer authentication token. verbose : int, default=1 Verbosity level. """ @@ -75,7 +82,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) if df_list is None: if verbose >= 1: print( @@ -111,7 +122,10 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": df = datasets.fetch_child( - url, verbose=verbose - 1, ensure_url=False + url, + verbose=verbose - 1, + ensure_url=False, + auth_token=auth_token, ) except Exception as err: if isinstance(err, KeyboardInterrupt): @@ -195,6 +209,11 @@ def get_parser(): default="query-outputs", help="Directory for downloaded datasets. 
Default is %(default)s.", ) + parser.add_argument( + "--auth-token", + type=str, + help="Bearer authentication token", + ) parser.add_argument( "--verbose", "-v", diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e0adc9e..cb335b3 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,14 +19,19 @@ T_POLL_INTV = 0.1667 -def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFrame]: +def fetch_child( + child_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url) + ds = PanDataSet(child_url, auth_token=auth_token) T_POLL_LAST = time.time() # Dataset is restricted if ds.loginstatus != "unrestricted": @@ -57,6 +62,7 @@ def fetch_children( parent_url: str, verbose=1, ensure_url=True, + auth_token=None, ) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset @@ -64,7 +70,7 @@ def fetch_children( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url) + ds = PanDataSet(parent_url, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -104,7 +110,7 @@ def fetch_children( elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - child = PanDataSet(url) + child = PanDataSet(url, auth_token=auth_token) T_POLL_LAST = time.time() if ds.loginstatus != "unrestricted": if verbose >= 1: diff --git a/requirements.txt b/requirements.txt index 368ee16..8726b63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ matplotlib>=3.4.2 numpy>=1.20.3 opencv-python>=4.5.2.54 pandas>=1.2.5 -pangaeapy>=0.0.5 +pangaeapy>=1.0.6 requests>=2.25.1 scikit-learn>=0.24.2 tqdm From 4da4e30197d8f6c07bcfa811d9414f53501571aa Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 7 Mar 2023 12:25:58 -0500 Subject: [PATCH 10/92] MNT: Only print saving dataframe if verbosity high enough --- pangaea_downloader/tools/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index cb335b3..397b969 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -176,7 +176,8 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) -> return False # Save if dataframe not empty df.to_csv(output_path, index=False) - print(f"{tabs}[{idx}] Saved to '{output_path}'") + if verbose >= 1: + print(f"{tabs}[{idx}] Saved to '{output_path}'") return True From aec74c6304f9f080036c328feeb2ca06d435a758 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:08:46 -0500 Subject: [PATCH 11/92] BUG: Remove unused import of IPython, not specified in requirements --- pangaea_downloader/merge_benthic_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index eb15e1a..8376eb3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ 
b/pangaea_downloader/merge_benthic_datasets.py @@ -15,7 +15,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from IPython.display import display from tqdm.auto import tqdm from pangaea_downloader import __meta__ From 682fba462315032a8b39b11bfbc006be296d6275 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:09:15 -0500 Subject: [PATCH 12/92] BUG: Skip non-CSV files when processing outputs --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 8376eb3..6aee8f8 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -712,6 +712,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): dfs_fnames = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 + if not fname.endswith(".csv"): + continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] df = pd.read_csv(os.path.join(input_dirname, fname)) From 759b1be9f4ec07153f461ccabeaa70ca6961faf9 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:10:40 -0500 Subject: [PATCH 13/92] MNT: Don't add dummy campaign and site columns when downloading datasets --- pangaea_downloader/tools/datasets.py | 8 -------- pangaea_downloader/tools/scraper.py | 7 ------- 2 files changed, 15 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 397b969..23c7688 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -147,16 +147,8 @@ def set_metadata(ds: PanDataSet) -> DataFrame: ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign - alt = str(ds.id) if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = ds.events[0].campaign.name - else: - ds.data["campaign"] = alt - # Dataset site/event/deployment - if "Event" in ds.data.columns: - ds.data["site"] = ds.data["Event"] - else: - ds.data["site"] = alt + "_site" return ds.data diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index a6bb0bc..22f4e84 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -73,15 +73,8 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: df["Latitude"] = lat df["Dataset"] = ds.title df["DOI"] = getattr(ds, "doi", "") - doi = getattr(ds, "doi", "").split("doi.org/")[-1] if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name - else: - df["Campaign"] = doi - if "Event" in ds.data.columns: - df["Site"] = ds.data["Event"] - else: - df["Site"] = doi + "_site" return df From 18651b8a12b504ccb9b3b8b4fc43aa18e2c7b4bc Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:50:41 -0500 Subject: [PATCH 14/92] MNT: Print message explaining errors being repeated --- pangaea_downloader/pq_scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 48aa831..6f618b2 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -157,7 +157,9 @@ def search_and_download( print(f"Number of files previously saved: {n_files}.") print(f"Total dataset files: {n_files + n_downloads}") print(f"Number of dataset errors (excluding access): 
{len(errors)}.") - + if len(errors) > 0: + print() + print("Captured errors are now repeated as follows.") for msg in errors: print() print(msg) From 53324810d17eb8f722b70ef287146131277912d3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:51:21 -0500 Subject: [PATCH 15/92] MNT: Ignore existing dataset and site columns --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6aee8f8..b0baa64 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -255,8 +255,8 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. desired_columns = { - "dataset": ["ds_id", "dataset", "Campaign", "campaign"], - "site": ["Event", "event", "Site", "site", "deployment"], + "dataset": ["ds_id"], + "site": ["Event", "event", "deployment"], "image": ["image", "filename"], "datetime": [ "Date/Time", From df49254d14df4122d3d4907954e342fa4fcd8749 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:51:50 -0500 Subject: [PATCH 16/92] MNT: Change default site to be based on dataset name, not DOI --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index b0baa64..bd2f688 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -365,6 +365,10 @@ def reformat_df(df, remove_duplicate_columns=True): # if "timestamp" not in df.columns and "datetime" in df.columns: # df["timestamp"] = df["datetime"].apply(datetime2timestamp) + # Add default site if it is missing + if "site" not in df.columns: + df["site"] = df["dataset"] + "_site" + if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( From a5786557c5f9297f75e6b1c310246e9adf16b0eb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:52:36 -0500 Subject: [PATCH 17/92] BUG: Reflect latitudesouth, latitude-, longitudewest, longitude- --- pangaea_downloader/merge_benthic_datasets.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index bd2f688..987e485 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -273,7 +273,7 @@ def reformat_df(df, remove_duplicate_columns=True): "latitude+", "latitudemed", "latitudenorth", - "latitudesouth", + # "latitudesouth", # special handling ], "longitude": [ "Longitude", @@ -282,8 +282,8 @@ def reformat_df(df, remove_duplicate_columns=True): "long", "longitude+", "longitudemed", - "longitudewest", "longitudeeast", + # "longitudewest", # special handling ], "x_pos": [], "y_pos": [], @@ -360,6 +360,16 @@ def reformat_df(df, remove_duplicate_columns=True): # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") + # Handle latitudesouth and longitudewest + if "latitude" not in df.columns and "latitudesouth" in df.columns: + df["latitude"] = -df["latitudesouth"] + if "latitude" not in df.columns and "latitude-" in df.columns: + df["latitude"] = -df["latitude-"] + if 
"longitude" not in df.columns and "longitudewest" in df.columns: + df["longitude"] = -df["longitudewest"] + if "longitude" not in df.columns and "longitude-" in df.columns: + df["longitude"] = -df["longitude-"] + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From ea5eba4e596766dc10a5c57a27a2bf07ed43dd04 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 19:18:41 -0500 Subject: [PATCH 18/92] MNT: Save results for parents whose children don't have URLs --- pangaea_downloader/pq_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 6f618b2..fac599b 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -85,6 +85,7 @@ def search_and_download( df_list = datasets.fetch_children( url, verbose=verbose - 1, + ensure_url=False, auth_token=auth_token, ) if df_list is None: From 8203880f99b9fe38938ad53f682c14ae5661c446 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 24 Mar 2023 17:29:21 +0000 Subject: [PATCH 19/92] MNT: Inherit verbosity from caller --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 987e485..f8b7f98 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -767,7 +767,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_with_repeat_urls.append(fname) # Try to fix repeated URLs that are accidental dups but should differ - df = fixup_repeated_urls(df, url_column=url_col, verbose=1) + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) if len(df) != len(df.drop_duplicates(subset=url_col)): files_with_repeat_urls2.append(fname) @@ -832,7 +832,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): else: if verbose >= 1: print("Fix repeated output paths to prevent collisions") - df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=2) + df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) if os.path.dirname(output_path): os.makedirs(os.path.dirname(output_path), exist_ok=True) From 77d4cf1619d5adf71672365b79732e6a9f6cd898 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:21:40 +0100 Subject: [PATCH 20/92] MNT: Read CSV files without low_memory mode due to 'mixed types' --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f8b7f98..f4a79fe 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -730,7 +730,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - df = pd.read_csv(os.path.join(input_dirname, fname)) + df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): continue From db0fae0361ed049e39f05294377c5a012385cecc Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:25:23 +0100 Subject: [PATCH 21/92] ENH: Interpolate or extract missing lat, lon, datetime metadata --- pangaea_downloader/merge_benthic_datasets.py | 707 ++++++++++++++++++- 
requirements.txt | 1 + 2 files changed, 707 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f4a79fe..b6ffb72 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -10,20 +10,24 @@ import os import re from collections import defaultdict +from functools import partial import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy.interpolate +from pangaeapy import PanDataSet from tqdm.auto import tqdm from pangaea_downloader import __meta__ from pangaea_downloader.tools import checker try: - from benthicnet.io import fixup_repeated_output_paths + from benthicnet.io import fixup_repeated_output_paths, row2basename except ImportError: fixup_repeated_output_paths = None + row2basename = None TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], @@ -695,6 +699,698 @@ def fixup_repeated_urls( return df +def fixup_favourite_images(df, verbose=1): + """ + Drop duplicated favourite images. + + These occur in Ingo Schewe's datasets along OFOS profiles during POLARSTERN + cruises, PANGAEA dataset ids 849814--849816 and 873995--874002. + + Parameters + ---------- + df : pandas.DataFrame + A PANGAEA dataframe with Type column. + verbose : int, default=1 + Level of verbosity. + + Returns + ------- + df : pandas.DataFrame + As input dataframe, but with all Type entries starting with favourite + removed (case-insensitive). + """ + if "Type" not in df.columns: + return df + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occurring immediately + # after them in the dataframe. + n_samples_before = len(df) + df = df[~df["Type"].str.lower().str.startswith("favourite")] + n_samples_after = len(df) + if verbose >= 1 and n_samples_after != n_samples_before: + print( + f"{df.iloc[0]['dataset']}:" + f" Removed {n_samples_before - n_samples_after} favourited images." + f" {n_samples_before} -> {n_samples_after} rows" + ) + return df + + +def get_dataset_datetime(ds_id): + """ + Determine a generic date for a dataset from the min and max extent datetimes. + + Parameters + ---------- + ds_id : int + The identifier of a PANGAEA dataset. + + Returns + ------- + dt_avg : str + The average datetime between the min and max extent, with precision + reduced to reflect what can accurately be represented. + """ + ds = PanDataSet(ds_id) + dt_min = pd.to_datetime(ds.mintimeextent) + dt_max = pd.to_datetime(ds.maxtimeextent) + if dt_min is None and dt_max is None: + return pd.NaT + elif dt_min is None: + return dt_max.strftime("%Y-%m-%d") + elif dt_max is None: + return dt_min.strftime("%Y-%m-%d") + delta = dt_max - dt_min + dt_avg = dt_min + delta / 2 + if delta > datetime.timedelta(days=90): + return dt_avg.strftime("%Y") + if delta > datetime.timedelta(days=4): + return dt_avg.strftime("%Y-%m") + if delta > datetime.timedelta(hours=3): + return dt_avg.strftime("%Y-%m-%d") + if delta > datetime.timedelta(minutes=5): + return dt_avg.strftime("%Y-%m-%d %H:00:00") + if delta > datetime.timedelta(seconds=5): + return dt_avg.strftime("%Y-%m-%d %H:%M:00") + return dt_avg.strftime("%Y-%m-%d %H:%M:%S") + + +def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): + """ + Extract datetime information from the contents of the image column in the dataframe.
+ + Note that the extraction operation is only performed on dataset IDs for + which the image naming scheme has been manually evaluated, and is not + applied blindly to datasets which have not been inspected. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int + The identifier of the PANGAEA dataset. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells filled in from the image. + Existing datetime values are unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + ds_id = int(ds_id) + + select = df["datetime"].isna() + + if row2basename is None: + selected_image = df.loc[select, "image"] + else: + selected_image = df[select].apply( + partial(row2basename, use_url_extension=True), axis=1 + ) + + selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) + + if ds_id in [ + 785104, + 785105, + 785108, + 785109, + 785110, + 836457, + 867771, + 867772, + 867773, + 867774, + 867775, + 867776, + 867777, + 867778, + 867806, + 867807, + 867808, + 867852, + 867853, + 867861, + 873541, + 875713, + 875714, + 876422, + 876423, + 876511, + 876512, + 876513, + 876514, + 876515, + 876516, + 876517, + 876518, + 880043, + 880044, + 885666, + 885667, + 885668, + 885669, + 885670, + 885672, + 885674, + 885675, + 885709, + 885712, + 885713, + 885714, + 885715, + 885716, + 885717, + 885718, + 885719, + 885720, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. PP_107-100_2012-03-19.png + # e.g. PP_100_2012-06-05a.jpg + # e.g. TH_122_2012-03-27.jpg + # e.g. J_05_2017_05_24a.jpg + # e.g. J_overview_2017-05-24za.jpg + # e.g. J_40_2017_08_11a.jpg + # e.g. J_05_2017-08-11a.jpg + # e.g. LG_OVERVIEW_01_05_06_07_09_2013_02_24a.jpg + # e.g. LG_01_07_2010_11_11a.jpg + # e.g. LG_01_2010_11_11a.jpg + # e.g. LG_Cluster1_2012_01_31a.jpg + # e.g. LG_01_07_2012_04_22a.jpg + # e.g. LG_SCREW_2012_04_22a.jpg + # e.g. So_01_2014_02_15b.jpg + # e.g. XH_01_2013_01_12_a.jpg + # e.g. XH_01%2B09_2013_11_19_a.jpg + # e.g. XH_01_2010_04_22_a.jpg + # e.g. LH_020_2015_01_28a_counted.jpg + # e.g. LH_020_2015_01_28xx.jpg + # e.g. J_J40%2BJ46%2BJ41_2016_09_25_a.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-11:].str.replace("_", "-").str.lstrip("-") + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [ + 789211, + 789212, + 789213, + 789214, + 789215, + 789216, + 789219, + 819234, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2003_W01-2.jpg + # e.g. 2004_B_bewachsen.jpg + # e.g. 2005_B.jpg + # e.g. 2013_B01-1.jpg + dtstr = selected_image_no_ext.str[:4] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [ + 789217, + 793210, + 793211, + 818906, + 818907, + 836263, + 836264, + 836265, + 836266, + 837653, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 04_2011.jpg + # e.g. 04a_2011_analog.jpg + # e.g. 04.2-2008.jpg + # e.g. 08-2008.jpg + # e.g. 04a_2013.jpg + # e.g. 05a_2003.jpg + # e.g. 
04_2007.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-4:] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [836024, 836025]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 00setting_2014-08.jpg + # e.g. 39.9_2014.jpg + # e.g. 2014_B01-1.jpg + df.loc[select, "datetime"] = "2014" + + elif ds_id in [840699, 840700, 840702, 840703, 840742, 840743]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_001_2012-01-31.jpg + # e.g. J_003_2012-01-31_2.jpg + # e.g. J_115_2012-01-31_a.jpg + # e.g. J_033_2012-08-08.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + dtstr = dtstr.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [840701, 849298]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_002_2013-03_03a.jpg + # e.g. J_001_2015-01.jpg + # e.g. J_001_2015-01_a.jpg + # e.g. J_056_2013-03_06logger.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y-%m") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [872407, 872408, 872409, 872410, 872411]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_40_2017-01-12_a.jpg + # e.g. J_overview2_2017-02-02_x.jpg + # e.g. J_xx_2017-01-12_x-62.jpg + # e.g. J_17_2017-01-14.jpg + # e.g. J_23_2017-01-14_b-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [878045, 888410]: + # Nothing to do + pass + + elif ds_id in [894734]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. HOTKEY_2018_03_27at21_09_21CP4A4682 + # e.g. TIMER_2018_03_18at04_04_09CP4A3970 + dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") + + elif ds_id in [896157]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2016-08-2600000.jpg + dtstr = selected_image_no_ext.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + if ds_id in [ + 918232, + 918233, + 918327, + 918340, + 918341, + 918382, + 918383, + 918385, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. XH_01_2010_04_22_a.jpg + # e.g. XH_01_2010_04_28a.jpg + # e.g. XH_03_2018_10_18_a-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:5])[:10]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + return df + + +def add_missing_datetime(df, ds_id=None, verbose=1): + """ + Add missing datetime values using either the mean extent or extraction from the file name. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. 
The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells completed, either by using the + average from the datetime extent metadata, or by extracting it from the + image name. + All existing datetime values are left unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + # Add datetimes that are still missing by inferring from the image filename + df = fix_missing_datetime_from_image_name(df, ds_id, verbose=verbose) + + if all(df["datetime"].isna()): + # This dataset has no datetime values + # Try to determine average datetime from the datetime extent metadata on + # the dataset record + dt_avg = get_dataset_datetime(ds_id) + if dt_avg is not None: + if verbose >= 1: + print( + f"{ds_id}: Using average datetime from extent" + f" - filenames look like {df.iloc[0]['image']}" + ) + df["datetime"] = dt_avg + + if not any(df["datetime"].isna()): + # This dataframe already has all datetime information + return df + + select = df["datetime"].isna() + if ds_id in [889035, 889025]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the abstract on PANGAEA (sic): + # Experimet was setup during 2007-02-15 and 2007-06-13. + df.loc[select, "datetime"] = "2007" + + if ds_id in [896160, 896164]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the INDEX 2016 ROV (see dataset title and paper + # https://doi.org/10.3389/fmars.2019.00096) + df.loc[select, "datetime"] = "2016" + + return df + + +def interpolate_by_datetime(df, columns): + """ + Use datetime column to interpolate values for selected columns. + + Parameters + ---------- + df : pandas.DataFrame + Dataframe with ``"datetime"`` column, which may contain missing values + in other columns. + columns : str or iterable of str + Name of column or columns to fill in missing values with interpolation. + + Returns + ------- + df : pandas.DataFrame + Like input, but with missing values in specified columns completed by + linear interpolation over datetime. + """ + # Convert datetime string to a datetime object + datetime_actual = pd.to_datetime(df["datetime"]) + has_datetime = ~datetime_actual.isna() + if isinstance(columns, str): + columns = [columns] + for col in columns: + has_col = ~df[col].isna() + has_dt_and_col = has_datetime & has_col + has_dt_not_col = has_datetime & ~has_col + df.loc[has_dt_not_col, col] = np.interp( + datetime_actual[has_dt_not_col], + datetime_actual[has_dt_and_col], + df.loc[has_dt_and_col, col], + ) + return df + + +def fixup_incomplete_metadata(df, ds_id=None, verbose=1): + """ + Fix datasets which have partial, but incomplete, lat/lon/datetime metadata. + + Interpolation is performed as appropriate to the dataset. The methodology + was determined by manually inspecting each dataset. + Any latitude and longitude values which can not be resolved are filled in + with the dataset-level mean latitude and longitude as reported by PANGAEA. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. 
+ + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime, latitude, longitude, and/or depth + cells completed by interpolation or similar. + All existing datetime values are left unchanged. + """ + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + if ds_id in [753197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + + if ds_id in [805606, 805607, 805611, 805612]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print(f"{ds_id}: Interpolating by index") + indices = np.arange(len(df)) + col = "datetime" + select_not_col = df[col].isna() + select_has_col = ~select_not_col + if any(select_has_col) and any(select_not_col): + missing_timestamps = np.interp( + indices[select_not_col], + indices[select_has_col], + pd.to_datetime(df.loc[select_has_col, "datetime"]).apply( + lambda x: x.timestamp() + ), + ) + df.loc[select_not_col, col] = [ + datetime.datetime.fromtimestamp(int(ts)) for ts in missing_timestamps + ] + + if ds_id == 875080: + # N.B. There is date metadata in the csv, but not time. But there is time + # metadata in the filename, so we could extract this if we wanted to. + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + # lat/lon was only recorded for the first 11 images. Fill in the rest + # with the median latitude and longitude for the record at the end + # of this function. + + if 873995 <= ds_id <= 874002: + if verbose >= 1: + print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") + # Interpolate lat, lon, and depth based on datetime + df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + + if ds_id in [875071, 875073]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Drop rows without datetime values (these have missing lat/lon as well) + # For 875071, these images are of the deck of the ship. + # For 875073, these images have a translation of less than half an image + # from the subsequent image, so we don't need the ones without metadata. + df = df[~df["datetime"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth"]) + + if ds_id in [875084]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # For 875084, images without latitude and longitude are not useful. + # The first three are of the deck, the rest are dark watercolumn shots. + df = df[~df["longitude"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth"]) + + if (878001 <= ds_id <= 878019) or ds_id == 878045: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # Images without metadata are of the water column and highly redundant. + df = df[~df["longitude"].isna()] + + if ds_id in [894732, 894734]: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # It's not clear to me that any of these images are of the seafloor. 
+ df = df[~df["longitude"].isna()] + + if ds_id in [895557, 903782, 903788, 903850, 907025, 894801]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print( + f"{ds_id}: Interpolating by index over subset of images in the same series" + ) + indices = np.arange(len(df)) + image_no_ext = df["image"].apply(lambda x: os.path.splitext(x)[0]) + image_major = image_no_ext.str[:-3] + missing_dt = df["datetime"].isna() + missing_lat = df["latitude"].isna() + missing_lon = df["longitude"].isna() + for image_major_i in image_major.unique(): + select = image_major == image_major_i + col = "latitude" + select_and_col = select & ~missing_lat + select_not_col = select & missing_lat + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "longitude" + select_and_col = select & ~missing_lon + select_not_col = select & missing_lon + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "datetime" + select_and_col = select & ~missing_dt + select_not_col = select & missing_dt + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = scipy.interpolate.interp1d( + indices[select_and_col], + pd.to_datetime(df.loc[select_and_col, col]), + kind="nearest", + fill_value="extrapolate", + )(indices[select_not_col]) + + if ds_id in [911904, 918924, 919348]: + if verbose >= 1: + print(f"{ds_id}: Extracting missing datetime metadata for dataset {ds_id}") + # Extract missing datetime from the filename, formatted like (e.g.) + # TIMER_2019_03_31_at_05_50_12_IMG_0263 + has_no_datetime = df["datetime"].isna() + fname_inner = df.loc[has_no_datetime, "image"].apply( + lambda x: "_".join(x.split("_")[1:-2]) + ) + df.loc[has_no_datetime, "datetime"] = pd.to_datetime( + fname_inner, format="%Y_%m_%d_at_%H_%M_%S" + ) + if verbose >= 1: + print( + f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" + ) + df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + + if ds_id in [914155]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Images without datetime are too dark + df = df[~df["datetime"].isna()] + # Other images are missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914156, 914197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. + for from_image, to_image in [ + ("IMG_0393", "IMG_0392"), + ("IMG_0395", "IMG_0394"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914192]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. 
+ for from_image, to_image in [ + ("IMG_1776", "IMG_1775"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if any(df["latitude"].isna() | df["longitude"].isna()): + # Fill in any missing latitude and longitude values with the + # mean coordinate reported at the dataset level + ds = PanDataSet(ds_id) + if hasattr(ds, "geometryextent"): + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Longitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean latitude for missing values") + df.loc[df["latitude"].isna(), "latitude"] = lat + if long is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean longitude for missing values") + df.loc[df["longitude"].isna(), "longitude"] = long + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. @@ -776,6 +1472,15 @@ def process_datasets(input_dirname, output_path=None, verbose=0): if sum(df.isna().all("columns")) > 0: print(f"{ds_id} has a row which is all NaNs") + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + dfs.append(df) dfs_fnames.append(fname) diff --git a/requirements.txt b/requirements.txt index 8726b63..05b0ea0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ pandas>=1.2.5 pangaeapy>=1.0.6 requests>=2.25.1 scikit-learn>=0.24.2 +scipy tqdm From 2c16f24788dc2c541e078a6b05a9e66812f18c82 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:29:21 +0100 Subject: [PATCH 22/92] MNT: Increase default verbosity level 0->1 --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index b6ffb72..bae82d6 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1595,7 +1595,7 @@ def get_parser(): "--verbose", "-v", action="count", - default=0, + default=1, help=textwrap.dedent( """ Increase the level of verbosity of the program.
This can be From 073efff5515e7db43499ae33ca0412c42674687a Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:29:42 +0100 Subject: [PATCH 23/92] MNT: Exclude some more dataset titles --- pangaea_downloader/merge_benthic_datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index bae82d6..f8db667 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -199,6 +199,13 @@ def check_title(title): return False if title.startswith("Images of shell cross sections"): return False + if ( + "early biofouling processes in a coastal lagoon" in title.lower() + or "early biofouling processes in a coastal la goon" in title.lower() + ): + return False + if "photographs of tiles" in title.lower(): + return False return True From 0a0b632874b3f39b5aac3881d0e9d4684bffa6d7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:30:13 +0100 Subject: [PATCH 24/92] MNT: Manually exclude dataset 805690, which was downloaded without its title? --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f8db667..aceae03 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1433,6 +1433,10 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] + if ds_id == "805690": + # The title was not captured from this dataset for some reason, + # so we can't exclude it via the title. + continue df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): From 2bbbd66816d79d11c68c6d5db2c9ca8a480b3cb2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:31:04 +0100 Subject: [PATCH 25/92] MNT: Make final report only appear if verbosity enabled --- pangaea_downloader/merge_benthic_datasets.py | 35 ++++++++++---------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index aceae03..89979b0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1495,23 +1495,24 @@ def process_datasets(input_dirname, output_path=None, verbose=0): dfs.append(df) dfs_fnames.append(fname) - print(f"There are {n_valid} valid (of {n_total}) valid datasets") - print( - f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" - ) - print( - f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" - ) - print() - print(f"There are {len(column_count)} unique column names:") - print() - - for col, count in dict( - sorted(column_count.items(), key=lambda item: item[1], reverse=True) - ).items(): - c = col + " " - print(f"{c:.<35s} {count:4d}") - print() + if verbose >= 0: + print(f"There are {n_valid} valid (of {n_total}) valid datasets") + print( + f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" + ) + print( + f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" + ) + print() + print(f"There are {len(column_count)} unique column names:") + print() + + for col, count in dict( + 
sorted(column_count.items(), key=lambda item: item[1], reverse=True) + ).items(): + c = col + " " + print(f"{c:.<35s} {count:4d}") + print() if verbose >= 1: print("Filter columns") From 37bedd6021fb5b3c9ce83b6fcb4dc49362d991c5 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:31:33 +0100 Subject: [PATCH 26/92] MNT: Remove unused import of dateutil.parser --- pangaea_downloader/merge_benthic_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 89979b0..ffab652 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -12,7 +12,6 @@ from collections import defaultdict from functools import partial -import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd From f7a44eef4b9e8f26293c9efb23d74a4aa1155bce Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:45:00 +0100 Subject: [PATCH 27/92] ENH: Use caching functionality built into PanDataSet --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- pangaea_downloader/tools/datasets.py | 6 +++--- pangaea_downloader/tools/scraper.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index ffab652..25ee80d 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -758,7 +758,7 @@ def get_dataset_datetime(ds_id): The average datetime between the min and max extent, with precision reduced to reflect what can accurately be represented. """ - ds = PanDataSet(ds_id) + ds = PanDataSet(ds_id, enable_cache=True) dt_min = pd.to_datetime(ds.mintimeextent) dt_max = pd.to_datetime(ds.maxtimeextent) if dt_min is None and dt_max is None: @@ -1373,7 +1373,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the # mean coordinate reported at the dataset level - ds = PanDataSet(ds_id) + ds = PanDataSet(ds_id, enable_cache=True) if hasattr(ds, "geometryextent"): lat = None long = None diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 23c7688..8cc319a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -31,7 +31,7 @@ def fetch_child( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url, auth_token=auth_token) + ds = PanDataSet(child_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Dataset is restricted if ds.loginstatus != "unrestricted": @@ -70,7 +70,7 @@ def fetch_children( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url, auth_token=auth_token) + ds = PanDataSet(parent_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -110,7 +110,7 @@ def fetch_children( elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - child = PanDataSet(url, auth_token=auth_token) + child = PanDataSet(url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() if ds.loginstatus != 
"unrestricted": if verbose >= 1: diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 22f4e84..a8fa7e5 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -17,7 +17,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: # Load dataset t_wait = max(0, datasets.T_POLL_LAST + datasets.T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(url) + ds = PanDataSet(url, enable_cache=True) datasets.T_POLL_LAST = time.time() # Request dataset url if verbose >= 1: From 1643a8bef872ade381c9b74d6f37ff521e167b26 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:00 +0100 Subject: [PATCH 28/92] MNT: Extract datetime from filename for rest of 896160 series --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 25ee80d..5d1c1e0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1027,7 +1027,7 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") - elif ds_id in [896157]: + elif ds_id in [896157, 896160, 896164]: if verbose >= 1: print( f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" From 5e4cf23ed0aae38fed1a73086da63cc38ea6bcab Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:16 +0100 Subject: [PATCH 29/92] MNT: Extract from filename from two more datasets --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 5d1c1e0..6aa8770 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -824,6 +824,8 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): if ds_id in [ 785104, 785105, + 785106, + 785107, 785108, 785109, 785110, From 2f0d49c82988e146627465b4811f86132b04274b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:47 +0100 Subject: [PATCH 30/92] MNT: Also manually exclude 803979, parent of 805690 --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6aa8770..11e1350 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1434,7 +1434,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - if ds_id == "805690": + if ds_id in ["805690", "803979"]: # The title was not captured from this dataset for some reason, # so we can't exclude it via the title. 
continue From f33f564b69ae05f792fd9eff3f40f9464109c4f2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:05:32 +0100 Subject: [PATCH 31/92] ENH: Extract datetime from filename for datasets 371062, 371063, 371064 --- pangaea_downloader/merge_benthic_datasets.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 11e1350..6e90b99 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -821,7 +821,14 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) - if ds_id in [ + if 371062 <= ds_id <= 371064: + # e.g. PO309_41-1_2004-04-05T08_55_41.jpg + # e.g. PO309_41-2-1_2004-04-05T11_28_26.jpg + # e.g. PO322_211-4-1_2005-05-18T19_35_31.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:])) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%dT%H-%M-%S") + + elif ds_id in [ 785104, 785105, 785106, From 6929f5ff51dd862a7215ecc662d48c1f62cb9453 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:22:32 +0100 Subject: [PATCH 32/92] MNT: C416 Unnecessary dict comprehension - rewrite using dict() --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index a8fa7e5..34a4b52 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -124,7 +124,7 @@ def get_pagination(page_soup: BeautifulSoup, src_url: str) -> Optional[dict]: # List of page URLs page_urls = [urljoin(src_url, a["href"]) for a in pagination.find_all("a")][:-1] # Page number : Page URL - page_dict = {k: v for k, v in zip(page_nums, page_urls)} + page_dict = dict(zip(page_nums, page_urls)) return page_dict From 4e1309c56ec7f190311283fc2d454eb2e42a1490 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:22:51 +0100 Subject: [PATCH 33/92] Revert "MNT: Save results for parents whose children don't have URLs" This reverts commit ea5eba4e596766dc10a5c57a27a2bf07ed43dd04. We don't need to save these superfluous results now we have caching enabled. 
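The caching relied on here works together with the module-level rate limiter in tools/datasets.py. A minimal sketch of the combined pattern, assuming the PanDataSet signature used in the diffs above; the helper name fetch_pan_dataset is illustrative and not part of the codebase:

    import time

    from pangaeapy import PanDataSet

    T_POLL_LAST = 0
    T_POLL_INTV = 0.1667  # stay under 180 requests every 30 s

    def fetch_pan_dataset(url_or_id, **kwargs):
        """Throttled, cached PanDataSet load (illustrative helper)."""
        global T_POLL_LAST
        # Sleep just long enough to respect the self-imposed request interval
        t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time())
        time.sleep(t_wait)
        # enable_cache=True lets repeat runs reuse records already fetched,
        # which is why the extra saved results reverted here became redundant
        ds = PanDataSet(url_or_id, enable_cache=True, **kwargs)
        T_POLL_LAST = time.time()
        return ds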
--- pangaea_downloader/pq_scraper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index fac599b..198e48d 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -85,7 +85,6 @@ def search_and_download( df_list = datasets.fetch_children( url, verbose=verbose - 1, - ensure_url=False, auth_token=auth_token, ) if df_list is None: @@ -125,7 +124,6 @@ def search_and_download( df = datasets.fetch_child( url, verbose=verbose - 1, - ensure_url=False, auth_token=auth_token, ) except Exception as err: From a7e152148f62eb50da10689ecd6ecb479a4e93bb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:40:24 +0100 Subject: [PATCH 34/92] DOC: Typo Scrapping -> Scraping --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 8cc319a..a3c4ded 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -103,7 +103,7 @@ def fetch_children( continue elif typ == "paginated": if verbose >= 1: - print(f"\t\t[{i+1}] Scrapping dataset...") + print(f"\t\t[{i+1}] Scraping dataset...") df = scraper.scrape_image_data(url) if df is not None: df_list.append(df) From 322b20cd5d820b1380234057270c78a22bf3c357 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:07:12 +0100 Subject: [PATCH 35/92] MNT: Check other children even if one is a restricted tabular dataset --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index a3c4ded..9c3c14b 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -119,7 +119,7 @@ def fetch_children( + f"\t\t[{i+1}] [ERROR] Access restricted: '{ds.loginstatus}'. {url}" + colorama.Fore.RESET ) - return + continue if ensure_url and not checker.has_url_col(child.data): if verbose >= 1: print( From f3f8d274dc94c5f0bf708deaa17d32809011fb0b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:07:51 +0100 Subject: [PATCH 36/92] RF: Better loop conditioning structure, with common code at the end --- pangaea_downloader/tools/datasets.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 9c3c14b..bb8a682 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -105,8 +105,6 @@ def fetch_children( if verbose >= 1: print(f"\t\t[{i+1}] Scraping dataset...") df = scraper.scrape_image_data(url) - if df is not None: - df_list.append(df) elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s @@ -127,12 +125,13 @@ def fetch_children( + f"\t\t[{i+1}] [WARNING] Image URL column NOT found! {url} Skipping..." 
+ colorama.Fore.RESET ) - else: - # Add metadata - df = set_metadata(child) - # Add child dataset to list - df = exclude_rows(df) - df_list.append(df) + continue + # Add metadata + df = set_metadata(child) + # Add child dataset to list + df = exclude_rows(df) + if df is not None: + df_list.append(df) # Return result if len(df_list) > 0: From 24583c9c10473f834cd59e725eaf651781c8638e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:23:36 +0100 Subject: [PATCH 37/92] MNT: Save title as dataset_title, not Dataset column --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 34a4b52..7df2ec1 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -71,7 +71,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: lat, long = coordinates df["Longitude"] = long df["Latitude"] = lat - df["Dataset"] = ds.title + df["dataset_title"] = ds.title df["DOI"] = getattr(ds, "doi", "") if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name From a752bdf92b8c8eecbc0100e35215e030b3bacbcf Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:29:04 +0100 Subject: [PATCH 38/92] RF: Move auto-deleting of partial file into save_df utility --- pangaea_downloader/pq_scraper.py | 11 +---------- pangaea_downloader/tools/datasets.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 198e48d..7165ba7 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -139,16 +139,7 @@ def search_and_download( # ----------------- SAVE TO FILE ----------------- # if df is None: continue - try: - saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) - except Exception as err: - # Delete partially saved file, if present - if os.path.isfile(output_path): - try: - os.remove(output_path) - except Exception: - pass - raise err + saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) n_downloads += 1 if saved else 0 if verbose >= 0: diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index bb8a682..2b24592 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -6,6 +6,8 @@ use pangaea_downloader.tools.scraper module. """ import os +import shutil +import tempfile import time from typing import List, Optional @@ -165,8 +167,14 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) -> if verbose >= 1: print(f"{tabs}[{idx}] Empty DataFrame! 
File not saved!") return False - # Save if dataframe not empty - df.to_csv(output_path, index=False) + # Save dataframe if it is not empty + with tempfile.TemporaryDirectory() as dir_tmp: + # Write to a temporary file + tmp_path = os.path.join(dir_tmp, os.path.basename(output_path)) + df.to_csv(tmp_path, index=False) + # Move our temporary file to the destination + os.makedirs(os.path.dirname(output_path), exist_ok=True) + shutil.move(tmp_path, output_path) if verbose >= 1: print(f"{tabs}[{idx}] Saved to '{output_path}'") return True From 03e381f93dc0d8576a0da490515ba7dff12033ab Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:30:23 +0100 Subject: [PATCH 39/92] ENH: Record ds_id while acquiring each dataset --- pangaea_downloader/tools/datasets.py | 18 ++++++++++++++++-- pangaea_downloader/tools/scraper.py | 5 ++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 2b24592..c8a591b 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -57,6 +57,10 @@ def fetch_child( df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) + # Add dataset ID + doi = getattr(ds, "doi", "") + ds_id = uri2dsid(doi if doi else child_url) + df["ds_id"] = ds_id return df @@ -89,6 +93,7 @@ def fetch_children( df_list = [] for i, child_uri in enumerate(ds.children): url = process.url_from_uri(child_uri) + ds_id = uri2dsid(child_uri) size = process.get_html_info(url) # Assess type try: @@ -132,8 +137,10 @@ def fetch_children( df = set_metadata(child) # Add child dataset to list df = exclude_rows(df) - if df is not None: - df_list.append(df) + if df is None: + continue + df["ds_id"] = ds_id + df_list.append(df) # Return result if len(df_list) > 0: @@ -224,6 +231,13 @@ def fix_text(text: str) -> str: return text +def uri2dsid(uri: str) -> str: + """ + Extract PANGAEA dataset ID from url/uri/doi string. 
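+    For example, "https://doi.org/10.1594/PANGAEA.805690" yields "805690".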
+ """ + return uri.split("PANGAEA.")[-1] + + def get_dataset_id(df: DataFrame) -> str: """Take a Pandas DataFrame as input and return the datasets Pangaea ID.""" col = find_column_match("doi") diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 7df2ec1..01c487a 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -72,7 +72,10 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: df["Longitude"] = long df["Latitude"] = lat df["dataset_title"] = ds.title - df["DOI"] = getattr(ds, "doi", "") + doi = getattr(ds, "doi", "") + df["DOI"] = doi + ds_id = datasets.uri2dsid(doi if doi else url) + df["ds_id"] = ds_id if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name return df From 2a45e0d1a8a0c10d19d46131380164693fe969d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:39:45 +0100 Subject: [PATCH 40/92] MNT: Save children of parents individually, not merged together --- pangaea_downloader/pq_scraper.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7165ba7..e187eb5 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -104,7 +104,22 @@ def search_and_download( + colorama.Fore.RESET ) continue - df = pd.concat(df_list) + for df in df_list: + if df is None: + continue + # Add the parent's ID to the dataframe + df["parent_ds_id"] = ds_id + # Save the child to its own CSV, including a column that + # records the parent's dataset ID + child_id = df.iloc[0]["ds_id"] + child_output_path = os.path.join(output_dir, f"{child_id}.csv") + saved = datasets.save_df( + df, child_output_path, level=1, verbose=verbose - 1 + ) + n_downloads += 1 if saved else 0 + # We have saved all the children individually, so will skip + # saving a redundant merged dataframe + continue else: try: dataset_type = process.ds_type(size) From 93ec48c3775e64f9788854bea6bcc2c9a9922de8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 09:43:15 +0100 Subject: [PATCH 41/92] ENH: Record child to parent dataset ID mapping --- pangaea_downloader/pq_scraper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index e187eb5..7cd6ee6 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -56,6 +56,10 @@ def search_and_download( os.makedirs(output_dir, exist_ok=True) df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) + fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") + # Process each result dictionary n_files = 0 n_downloads = 0 @@ -117,6 +121,8 @@ def search_and_download( df, child_output_path, level=1, verbose=verbose - 1 ) n_downloads += 1 if saved else 0 + with open(fname_child2parent, "a") as f: + f.write(f"{child_id},{ds_id}\n") # We have saved all the children individually, so will skip # saving a redundant merged dataframe continue From c0c75cc7e0720575f6cbcf1784b07cd5a6b6774d Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:02:10 +0100 Subject: [PATCH 42/92] MNT: Fix latitude- and longitude- lookup --- pangaea_downloader/merge_benthic_datasets.py | 24 +++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git 
a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6e90b99..59c44c3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -371,14 +371,22 @@ def reformat_df(df, remove_duplicate_columns=True): df.rename(columns=mapping, inplace=True, errors="raise") # Handle latitudesouth and longitudewest - if "latitude" not in df.columns and "latitudesouth" in df.columns: - df["latitude"] = -df["latitudesouth"] - if "latitude" not in df.columns and "latitude-" in df.columns: - df["latitude"] = -df["latitude-"] - if "longitude" not in df.columns and "longitudewest" in df.columns: - df["longitude"] = -df["longitudewest"] - if "longitude" not in df.columns and "longitude-" in df.columns: - df["longitude"] = -df["longitude-"] + if "latitude" not in df.columns and "latitudesouth" in lower_cols: + col = df.columns[lower_cols.index("latitudesouth")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "latitude" not in df.columns and "latitude-" in lower_cols: + col = df.columns[lower_cols.index("latitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "longitude" not in df.columns and "longitudewest" in lower_cols: + col = df.columns[lower_cols.index("longitudewest")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + if "longitude" not in df.columns and "longitude-" in lower_cols: + col = df.columns[lower_cols.index("longitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) From 0099de3805ee91a21d3e831e082ec6d3e346dda8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:04:52 +0100 Subject: [PATCH 43/92] MNT: Fix method for merging elevation data with depth data --- pangaea_downloader/merge_benthic_datasets.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 59c44c3..8958f87 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -304,7 +304,6 @@ def reformat_df(df, remove_duplicate_columns=True): "bathymetry", "bathy", "depth", - "elevation", ], "backscatter": [], "temperature": ["temperature", "temp"], @@ -388,6 +387,12 @@ def reformat_df(df, remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + # Use elevation if there was no depth + if "depth" not in df.columns and "elevation" in lower_cols: + col = df.columns[lower_cols.index("elevation")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["depth"] = -df[col] + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 36570fc68e1faaa8e79020ca184ef2a60e42db26 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:05:16 +0100 Subject: [PATCH 44/92] MNT: Redact erroneously negative depth values --- pangaea_downloader/merge_benthic_datasets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 8958f87..ae2448f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -387,6 +387,11 @@ def reformat_df(df, 
remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + # Remove datapoints with erroneous negative depth + if "depth" in df.columns: + # Only observed two datapoints where this happens + df.loc[df["depth"] < 0, "depth"] = pd.NA + # Use elevation if there was no depth if "depth" not in df.columns and "elevation" in lower_cols: col = df.columns[lower_cols.index("elevation")] From 5760dd5c9bfd3444edc10a33d673eab8d01da5a2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:05:49 +0100 Subject: [PATCH 45/92] ENH: Handle heightaboveseafloor as an altitude field --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index ae2448f..088b12f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -297,7 +297,7 @@ def reformat_df(df, remove_duplicate_columns=True): ], "x_pos": [], "y_pos": [], - "altitude": ["altitude", "height"], + "altitude": ["altitude", "heightaboveseafloor", "height"], "depth": [ "depthwater", "bathydepth", From daedd46ab06b308c0a866f37074d50691f919e53 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:08:37 +0100 Subject: [PATCH 46/92] ENH: Add kwargs pass-through to interpolate_by_datetime --- pangaea_downloader/merge_benthic_datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 088b12f..c65e775 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1154,7 +1154,7 @@ def add_missing_datetime(df, ds_id=None, verbose=1): return df -def interpolate_by_datetime(df, columns): +def interpolate_by_datetime(df, columns, **kwargs): """ Use datetime column to interpolate values for selected columns. @@ -1165,6 +1165,8 @@ def interpolate_by_datetime(df, columns): in other columns. columns : str or iterable of str Name of column or columns to fill in missing values with interpolation. + **kwargs + Additional arguments as per :func:`numpy.interp`. 
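+        For example, passing ``left=numpy.nan`` and ``right=numpy.nan`` leaves values
+        outside the measured datetime range unfilled rather than clamping them to the
+        first or last known value.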
Returns ------- @@ -1185,6 +1187,7 @@ def interpolate_by_datetime(df, columns): datetime_actual[has_dt_not_col], datetime_actual[has_dt_and_col], df.loc[has_dt_and_col, col], + **kwargs, ) return df From df1a8558afba11913fac10c6bc3ae3004b6e3930 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:10:26 +0100 Subject: [PATCH 47/92] MNT: Don't extrapolate depth beyond measured values --- pangaea_downloader/merge_benthic_datasets.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index c65e775..50553ba 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1180,6 +1180,12 @@ def interpolate_by_datetime(df, columns, **kwargs): if isinstance(columns, str): columns = [columns] for col in columns: + interp_kwargs = kwargs + if col in ["depth", "altitude"]: + if "left" not in interp_kwargs: + interp_kwargs["left"] = np.nan + if "right" not in interp_kwargs: + interp_kwargs["right"] = np.nan has_col = ~df[col].isna() has_dt_and_col = has_datetime & has_col has_dt_not_col = has_datetime & ~has_col @@ -1187,7 +1193,7 @@ def interpolate_by_datetime(df, columns, **kwargs): datetime_actual[has_dt_not_col], datetime_actual[has_dt_and_col], df.loc[has_dt_and_col, col], - **kwargs, + **interp_kwargs, ) return df From 9bfe308c1eedd4981e99db51b480acbcda530276 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:13:13 +0100 Subject: [PATCH 48/92] ENH: Interpolate holes in depth values based on datetime --- pangaea_downloader/merge_benthic_datasets.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 50553ba..d2cff88 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1406,6 +1406,18 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # Fill in any missing latitude and longitude metadata df = interpolate_by_datetime(df, ["latitude", "longitude"]) + if ( + (702075 <= ds_id <= 702080) + or (818484 <= ds_id <= 818509) + or ds_id in [849287, 849289] + or 862084 <= ds_id <= 862097 + or ds_id in [875072, 875074] + or 875081 <= ds_id <= 875085 + ): + if verbose >= 1: + print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") + df = interpolate_by_datetime(df, ["depth"]) + if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the # mean coordinate reported at the dataset level From 10caaa033f4654c99abd7b4cbad824f79ef93c36 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:13:50 +0100 Subject: [PATCH 49/92] BUG: Check if child dataframe is empty before trying to save --- pangaea_downloader/pq_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7cd6ee6..e99f14d 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -109,7 +109,7 @@ def search_and_download( ) continue for df in df_list: - if df is None: + if df is None or len(df) == 0: continue # Add the parent's ID to the dataframe df["parent_ds_id"] = ds_id From 53356a62894fd652f67f1d634c3b376ff5486c4b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:15:08 +0100 Subject: [PATCH 50/92] MNT: Save empty parent CSV for easy search download resumption 
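The interpolate_by_datetime changes in the preceding patches boil down to numpy.interp over timestamps. A minimal single-column sketch, assuming rows are ordered by time; the names here are illustrative and the real function handles several columns at once:

    import numpy as np
    import pandas as pd

    def interp_column_by_datetime(df, col, **interp_kwargs):
        """Fill missing values of `col` by interpolating over the datetime column."""
        dt = pd.to_datetime(df["datetime"], errors="coerce")
        known = dt.notna() & df[col].notna()    # rows that anchor the interpolation
        missing = dt.notna() & df[col].isna()   # rows that can be filled in
        # np.interp needs plain numbers, so use nanoseconds since the epoch;
        # only rows with a valid datetime are indexed, so NaT never reaches it
        df.loc[missing, col] = np.interp(
            dt[missing].astype("int64"),
            dt[known].astype("int64"),
            df.loc[known, col],
            **interp_kwargs,
        )
        return df

Passing left=np.nan and right=np.nan, as done above for the depth columns, means timestamps outside the measured range stay empty instead of being pinned to the first or last measurement.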
--- pangaea_downloader/pq_scraper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index e99f14d..9623b10 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -57,8 +57,9 @@ def search_and_download( df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" - with open(fname_child2parent, "w") as f: - f.write("child,parent\n") + if not os.path.isfile(fname_child2parent): + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") # Process each result dictionary n_files = 0 @@ -125,6 +126,9 @@ def search_and_download( f.write(f"{child_id},{ds_id}\n") # We have saved all the children individually, so will skip # saving a redundant merged dataframe + # But we will save an empty file so we know to skip + with open(output_path, "w") as f: + f.write("is_parent") continue else: try: From b5f3da5b4375cdd90825b91165724db88aac2d47 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:20:49 +0100 Subject: [PATCH 51/92] STY: Import from instead of aliasing --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 01c487a..0af5cea 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -9,7 +9,7 @@ from pangaeapy import PanDataSet from requests.compat import urljoin -import pangaea_downloader.tools.datasets as datasets +from pangaea_downloader.tools import datasets def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: From ff2e9a445c55238207bf7985491b02e0875de34f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:23:32 +0100 Subject: [PATCH 52/92] ENH: Add wrapper to requests.get with 30s backoff on 429 status --- pangaea_downloader/tools/requesting.py | 49 ++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 pangaea_downloader/tools/requesting.py diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py new file mode 100644 index 0000000..1ec7ef0 --- /dev/null +++ b/pangaea_downloader/tools/requesting.py @@ -0,0 +1,49 @@ +""" +URL request utilities. +""" + +import time + +import requests + + +def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs): + """ + Fetch a URL resource using requests with a custom backoff strategy for re-attempts. + + Parameters + ---------- + url : str + The URL to request. + retries : int, default=5 + Maximum number of attempts. + backoff_factor : float, default=1 + Base time to wait for before attempting to download again when receiving + a 500 or 503 HTTP status code. + verbose : int, default=1 + Verbosity level. + **kwargs + Additional arguments as per :func:`requests.get`. + """ + for i_attempt in range(retries): + r = requests.get(url, **kwargs) + if r.status_code not in [429, 500, 503]: + # Status code looks good + break + # N.B. Could also retry on [408, 502, 504, 599] + if r.status_code == 429: + # PANGAEA has a maximum of 180 requests within a 30s period + # Wait for this to cool off completely. + t_wait = 30 + else: + # Other errors indicate a server side error. Wait a + # short period and then retry to see if it alleviates. 
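+            # e.g. with backoff_factor=1 the retry waits grow as 1 s, 2 s, 4 s, ...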
+ t_wait = backoff_factor * 2**i_attempt + if verbose >= 1: + print( + "Retrying in {} seconds (HTTP Status {}): {}".format( + t_wait, r.status_code, url + ) + ) + time.sleep(t_wait) + return r From 55a4563e9aec764fe85a6e00334ef55af334d5d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:25:00 +0100 Subject: [PATCH 53/92] ENH: Use 30s backoff on 429 status --- pangaea_downloader/citations.py | 5 +++-- pangaea_downloader/licenses.py | 5 +++-- pangaea_downloader/tools/eda.py | 5 +++-- pangaea_downloader/tools/process.py | 5 +++-- pangaea_downloader/tools/scraper.py | 8 ++++---- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pangaea_downloader/citations.py b/pangaea_downloader/citations.py index db6796f..833463c 100644 --- a/pangaea_downloader/citations.py +++ b/pangaea_downloader/citations.py @@ -1,13 +1,14 @@ import pickle import pandas as pd -import requests + +from .tools import requesting def get_bibtex(ds_id: str, verbose=False) -> str: """Get the BibTex Citation of a Pangaea dataset using the dataset ID.""" bib_url = f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}?format=citation_bibtex" - resp = requests.get(bib_url) + resp = requesting.get_request_with_backoff(bib_url) if verbose: print("\tStatus code:", resp.status_code) return resp.text diff --git a/pangaea_downloader/licenses.py b/pangaea_downloader/licenses.py index 4ba9a38..8cfac90 100644 --- a/pangaea_downloader/licenses.py +++ b/pangaea_downloader/licenses.py @@ -5,10 +5,11 @@ from typing import Dict, Optional, Union import pandas as pd -import requests from bs4 import BeautifulSoup from tqdm import tqdm +from .tools import requesting + def get_dataset_url(ds_id: Union[str, int]) -> str: """Return dataset URL given the six digit dataset ID.""" @@ -18,7 +19,7 @@ def get_dataset_url(ds_id: Union[str, int]) -> str: def get_dataset_license_info(url: str) -> Optional[Dict[str, str]]: """Return a dictionary with license information given the dataset URL.""" # Make a request to the URL and parse the html - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") # Get the tag containing the license info license_tag = soup.find("a", attrs={"rel": "license"}) diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py index 219fa31..9696c93 100644 --- a/pangaea_downloader/tools/eda.py +++ b/pangaea_downloader/tools/eda.py @@ -4,10 +4,11 @@ import matplotlib.cm import matplotlib.colors import numpy as np -import requests from matplotlib.pyplot import get_cmap from sklearn.neighbors import KernelDensity +from . import requesting + def url_from_doi(doi: str) -> str: """ @@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array: """Take an image url and return retrieved image array.""" success = False while not success: - resp = requests.get(url, stream=True) + resp = requesting.get_request_with_backoff(url, stream=True) print(f"status code: {resp.status_code}") if verbose else 0 success = True if (resp.status_code == 200) else False if success: diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py index 6d92e73..74d8ea6 100644 --- a/pangaea_downloader/tools/process.py +++ b/pangaea_downloader/tools/process.py @@ -1,9 +1,10 @@ """Functions for processing each of the result items.""" from typing import Optional, Tuple -import requests from bs4 import BeautifulSoup +from . 
import requesting + def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str: """Take a pangaea uri/doi string as input and return its corresponding url string.""" @@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]: def get_html_info(url: str) -> Optional[str]: """Make get request to dataset webpage and return dataset size.""" # Make get request to webpage - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) if resp.status_code == 200: # Parse html soup = BeautifulSoup(resp.text, "lxml") diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 0af5cea..b5871b5 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -9,7 +9,7 @@ from pangaeapy import PanDataSet from requests.compat import urljoin -from pangaea_downloader.tools import datasets +from pangaea_downloader.tools import datasets, requesting def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: @@ -22,7 +22,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: # Request dataset url if verbose >= 1: print("\t\t\t[INFO] Requesting:", url) - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) # Parse response soup = BeautifulSoup(resp.text, "lxml") # Get coordinates of expedition @@ -58,7 +58,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: if verbose >= 1: print("\t\t\t[INFO] URL to photos page:", download_link) # Get to photos page (page 1) - resp = requests.get(download_link) + resp = requesting.get_request_with_backoff(download_link) photos_page = BeautifulSoup(resp.text, "lxml") img_urls = get_urls_from_each_page(photos_page, src_url, verbose=verbose) if img_urls is None: @@ -107,7 +107,7 @@ def get_urls_from_each_page( if verbose >= 1: print(f"\t\t\t[INFO] Processing Page {n}...") url = pagination[n] - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") urls = get_page_image_urls(soup, verbose=verbose) img_urls.extend(urls) From 87cd24ac72361795b080d30464e366c6c34ef75b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:25:40 +0100 Subject: [PATCH 54/92] JNB: Fix reference to benthicnet.io utilities --- notebooks/explore-depth-columns.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index b84d525..27aeaaf 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -18,7 +18,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from benthicnet.utils import sanitize_filename, sanitize_filename_series\n", + "from benthicnet.io import sanitize_filename, sanitize_filename_series\n", "from IPython.display import display\n", "from tqdm.auto import tqdm\n", "\n", From d6cb21152d1d3c33f8fd2a0b148be8b5f8e99c86 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:29:06 +0100 Subject: [PATCH 55/92] JNB: Don't use low_memory mode loading df --- notebooks/explore-depth-columns.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 27aeaaf..2d4110b 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -249,7 +249,7 @@ "\n", "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df 
= pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -308,7 +308,7 @@ "key = \"bathy depth\"\n", "\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -361,7 +361,7 @@ "# Depth bot & depth top\n", "\n", "for i, file in enumerate(column_examples[keys[0]]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", @@ -458,7 +458,7 @@ "keys = [\"depth\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -513,7 +513,7 @@ "keys = [\"depth water\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", From 6cb1bc5b76e81040b5db21923117c795662be01d Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:29:52 +0100 Subject: [PATCH 56/92] JNB: Fix typo --- notebooks/explore-depth-columns.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 2d4110b..65dedd2 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -272,7 +272,7 @@ " # print(\"\\tMin or Max non-positive.\")\n", " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", - " print(\"\\tStart or Ene near zero.\")\n", + " print(\"\\tStart or End near zero.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, From 973b25de4cbd12fc7a257e8cfe42b575ade5812a Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:30:26 +0100 Subject: [PATCH 57/92] JNB+BUG: Need to reset val_exception before parsing new keys --- notebooks/explore-depth-columns.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 65dedd2..f741734 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -307,6 +307,7 @@ "# Column to find\n", "key = \"bathy depth\"\n", "\n", + "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", From b583e8d3d312b75b0f3c04633a03898e5580c8c1 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:31:34 +0100 Subject: [PATCH 58/92] JNB+MNT: Reflect yaxis instead of plotting negative of depth --- notebooks/explore-depth-columns.ipynb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index f741734..df659e2 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -265,7 +265,8 @@ " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", " )\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", @@ -321,7 +322,8 @@ " # Show\n", " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", From 2782beb4d96996e12b915ca47d50d7a26ebd93f7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:33:27 +0100 Subject: [PATCH 59/92] JNB: Add title, ylabel, and print link to dataset --- notebooks/explore-depth-columns.ipynb | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index df659e2..dea076c 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -266,8 +266,11 @@ " )\n", " plt.figure(figsize=(16, 4))\n", " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", " # print(\"\\tMin or Max non-positive.\")\n", @@ -323,8 +326,11 @@ " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", " val_exception[url] = (mean, sd, min_, max_)" @@ -383,7 +389,9 @@ " plt.plot(df[key], label=key)\n", " plt.plot(abs(df[\"depth top\"] - df[\"depth bot\"]), label=\"diff\", linestyle=\":\")\n", " plt.legend()\n", - " plt.show()" + " plt.title(url.split(\"/\")[-1])\n", + " plt.show()\n", + " print(url)" ] }, { From f8663be7d8c1ffa49d7354777c1882af0b55c906 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:34:47 +0100 Subject: [PATCH 60/92] JNB: Highlight negative depth --- notebooks/explore-depth-columns.ipynb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index dea076c..34dcde7 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -277,6 +277,9 @@ " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", " print(\"\\tStart or End near zero.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if min_ < 0:\n", + " print(\"\\tNegative depth.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, From e809f54c4c5373a4f0479ce90b92371f1fa5376f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:35:41 +0100 Subject: [PATCH 61/92] 
JNB: Plot elevation --- notebooks/explore-depth-columns.ipynb | 173 +++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 3 deletions(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 34dcde7..2f3c590 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -397,6 +397,73 @@ " print(url)" ] }, + { + "cell_type": "markdown", + "id": "0ee401a9-e936-4d8b-915d-ed3b1303fd65", + "metadata": {}, + "source": [ + "### 2.4 Elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1ae559-e6ee-47b8-8f20-69bcef238cb5", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Column to find\n", + "key = \"elevation\"\n", + "\n", + "val_exception = {}\n", + "for i, file in enumerate(column_examples[key]):\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " url_column = find_url_column(df)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Check for start and end at 0 altitude/depth\n", + " start, end = df[key].iloc[0], df[key].iloc[-1]\n", + " # Show\n", + " print(\n", + " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", + " )\n", + " plt.figure(figsize=(16, 4))\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.show()\n", + " print(url)\n", + " # Datasets that defy column value norms\n", + " # if (min_ <= 0) or (max_ <= 0):\n", + " # print(\"\\tMin or Max non-positive.\")\n", + " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if max_ > 0:\n", + " print(\"\\tPositive elevation.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb64c1f-b5b3-4d8c-9943-b9c6810a1d53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_exception" + ] + }, { "cell_type": "markdown", "id": "83e54609", @@ -416,7 +483,7 @@ "print(len(column_examples[\"depth water\"]))\n", "print(len(column_examples[\"bathy depth\"]))\n", "print(len(column_examples[\"bathy depth_2\"]))\n", - "print(len(column_examples[\"bathy_depth\"]))" + "print(len(column_examples[\"elevation\"]))" ] }, { @@ -614,13 +681,113 @@ "**NOTE:** Upon checking the dataset webpages we see that the two bathy depth columns correspond to the original collection and recollection sites." 
] }, + { + "cell_type": "markdown", + "id": "61d6de0f-09d8-43f5-a2b6-c47afed77a9d", + "metadata": {}, + "source": [ + "## 3.5 Datasets with depth water and elevation" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6aabda82", + "id": "c3627d1c-717d-4dc2-b20d-761adebd513d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column1 = \"depth water\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eeae8d64-0038-47ea-bc0b-8e59e0724b5e", "metadata": {}, + "source": [ + "## 3.6 Datasets with bathy depth and elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eccfc931-307a-4007-8306-6ea918a1489b", + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "column1 = \"bathy depth\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] } ], "metadata": { From 154ff1fca76171fa4bc1a889c06739f17af81bb6 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:48:49 +0100 Subject: [PATCH 62/92] BUG: Need to drop columns after handling reversed columns --- pangaea_downloader/merge_benthic_datasets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py 
index d2cff88..45e3b31 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -364,8 +364,6 @@ def reformat_df(df, remove_duplicate_columns=True): elif col not in mapping and col not in cols_to_drop: cols_to_drop.append(col) - # Remove superfluous columns - df.drop(labels=cols_to_drop, axis="columns", inplace=True) # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") @@ -398,6 +396,9 @@ def reformat_df(df, remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["depth"] = -df[col] + # Remove superfluous columns + df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 4a3cb25b5c8abe2c8fd213e576a4efc934b7edd2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:49:23 +0100 Subject: [PATCH 63/92] MNT: Drop latitude-, longitude- if used --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 45e3b31..4ad9507 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -372,18 +372,22 @@ def reformat_df(df, remove_duplicate_columns=True): col = df.columns[lower_cols.index("latitudesouth")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] + cols_to_drop.append("latitudesouth") if "latitude" not in df.columns and "latitude-" in lower_cols: col = df.columns[lower_cols.index("latitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] + cols_to_drop.append("latitude-") if "longitude" not in df.columns and "longitudewest" in lower_cols: col = df.columns[lower_cols.index("longitudewest")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + cols_to_drop.append("longitudewest") if "longitude" not in df.columns and "longitude-" in lower_cols: col = df.columns[lower_cols.index("longitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + cols_to_drop.append("longitude-") # Remove datapoints with erroneous negative depth if "depth" in df.columns: From 42bdc5374268cdd72fcfafe17b7f355357c1f8cb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:31:01 +0100 Subject: [PATCH 64/92] MNT: Save depth_of_observer, bathymetry, and elevation separately --- pangaea_downloader/merge_benthic_datasets.py | 41 +++++++++----------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 4ad9507..74b3e8b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -298,13 +298,9 @@ def reformat_df(df, remove_duplicate_columns=True): "x_pos": [], "y_pos": [], "altitude": ["altitude", "heightaboveseafloor", "height"], - "depth": [ - "depthwater", - "bathydepth", - "bathymetry", - "bathy", - "depth", - ], + "depth_of_observer": ["depthwater", "depth"], + "bathymetry": ["bathydepth", "bathymetry", "bathy"], + "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], "salinity": ["salinity", "sal"], @@ -390,15 +386,9 @@ def reformat_df(df, remove_duplicate_columns=True): cols_to_drop.append("longitude-") # Remove datapoints with 
erroneous negative depth - if "depth" in df.columns: + if "depth_of_observer" in df.columns: # Only observed two datapoints where this happens - df.loc[df["depth"] < 0, "depth"] = pd.NA - - # Use elevation if there was no depth - if "depth" not in df.columns and "elevation" in lower_cols: - col = df.columns[lower_cols.index("elevation")] - print(f"Using {col} for {df.iloc[0]['dataset']}") - df["depth"] = -df[col] + df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA # Remove superfluous columns df.drop(labels=cols_to_drop, axis="columns", inplace=True) @@ -1185,8 +1175,10 @@ def interpolate_by_datetime(df, columns, **kwargs): if isinstance(columns, str): columns = [columns] for col in columns: + if col not in df: + continue interp_kwargs = kwargs - if col in ["depth", "altitude"]: + if col in ["depth", "depth_of_observer", "bathymetry", "altitude"]: if "left" not in interp_kwargs: interp_kwargs["left"] = np.nan if "right" not in interp_kwargs: @@ -1273,7 +1265,9 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): if verbose >= 1: print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") # Interpolate lat, lon, and depth based on datetime - df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) if ds_id in [875071, 875073]: if verbose >= 1: @@ -1284,7 +1278,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # from the subsequent image, so we don't need the ones without metadata. df = df[~df["datetime"].isna()] # Interpolate missing depth values - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if ds_id in [875084]: if verbose >= 1: @@ -1293,7 +1287,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # The first three are of the deck, the rest are dark watercolumn shots. 
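        # (These are the frames removed by the missing-longitude filter below.)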
df = df[~df["longitude"].isna()] # Interpolate missing depth values - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if (878001 <= ds_id <= 878019) or ds_id == 878045: if verbose >= 1: @@ -1366,7 +1360,9 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): print( f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" ) - df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) if ds_id in [914155]: if verbose >= 1: @@ -1421,7 +1417,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): ): if verbose >= 1: print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the @@ -1578,7 +1574,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "latitude", "longitude", "altitude", - "depth", + "depth_of_observer", + "bathymetry", "backscatter", "temperature", "salinity", From eda209d6c26eab8f979bafa97b94de99cac0d223 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:32:34 +0100 Subject: [PATCH 65/92] MNT: Rearrange so old columns are dropped before mapping new ones onto them --- pangaea_downloader/merge_benthic_datasets.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 74b3e8b..2916f3a 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -360,6 +360,9 @@ def reformat_df(df, remove_duplicate_columns=True): elif col not in mapping and col not in cols_to_drop: cols_to_drop.append(col) + # Remove superfluous columns + df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") @@ -368,31 +371,24 @@ def reformat_df(df, remove_duplicate_columns=True): col = df.columns[lower_cols.index("latitudesouth")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] - cols_to_drop.append("latitudesouth") if "latitude" not in df.columns and "latitude-" in lower_cols: col = df.columns[lower_cols.index("latitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] - cols_to_drop.append("latitude-") if "longitude" not in df.columns and "longitudewest" in lower_cols: col = df.columns[lower_cols.index("longitudewest")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] - cols_to_drop.append("longitudewest") if "longitude" not in df.columns and "longitude-" in lower_cols: col = df.columns[lower_cols.index("longitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] - cols_to_drop.append("longitude-") # Remove datapoints with erroneous negative depth if "depth_of_observer" in df.columns: # Only observed two datapoints where this happens df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA - # Remove superfluous columns - df.drop(labels=cols_to_drop, axis="columns", inplace=True) - # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 
4d5718839b0a2191875c813dac74731c31eaa9f8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:33:01 +0100 Subject: [PATCH 66/92] MNT: Change warning colour from red to yellow --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index b5871b5..45d5782 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -29,7 +29,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: coordinates = get_metadata(soup) if coordinates is None and hasattr(ds, "geometryextent"): print( - colorama.Fore.RED + "\t\t\t[ALERT] Trying to get coordinates from" + colorama.Fore.YELLOW + "\t\t\t[ALERT] Trying to get coordinates from" " PanDataSet.geometryextent" + colorama.Fore.RESET ) lat = None From 0dc20c61703617289c5d74ba8b62cf41c8f107b5 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:33:19 +0100 Subject: [PATCH 67/92] MNT: Change Campaign -> campaign --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 45d5782..a36cd34 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -77,7 +77,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: ds_id = datasets.uri2dsid(doi if doi else url) df["ds_id"] = ds_id if (len(ds.events) > 0) and (ds.events[0].campaign is not None): - df["Campaign"] = ds.events[0].campaign.name + df["campaign"] = ds.events[0].campaign.name return df From 87452af3d1dd467213ced87221b87f74a5be858e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:54:09 +0100 Subject: [PATCH 68/92] BUG: Add pangaea- to ds_id for dataframe output --- pangaea_downloader/merge_benthic_datasets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2916f3a..1240513 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1491,7 +1491,11 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_without_url.append(fname) continue - df["ds_id"] = f"pangaea-{ds_id}" + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + else: + df["ds_id"] = f"pangaea-{ds_id}" + df = reformat_df(df) if df is None: continue From d135600f7f42f3d385af3c47e9b5d6be089882d2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:54:59 +0100 Subject: [PATCH 69/92] MNT: Allow photographs of tiles --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 1240513..4a1bdf4 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -204,7 +204,7 @@ def check_title(title): ): return False if "photographs of tiles" in title.lower(): - return False + pass return True From a3b14e20b8a80234320fb04193321e3bd784d6f0 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:55:42 +0100 Subject: [PATCH 70/92] ENH: Include parent_ds_id in output dataframe --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py 
b/pangaea_downloader/merge_benthic_datasets.py index 4a1bdf4..0d3ec6b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1581,6 +1581,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "salinity", "chlorophyll", "acidity", + "parent_ds_id", } df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] From 2082aa31901b4bfbf1fe68a8f8194052c23e66d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:58:14 +0100 Subject: [PATCH 71/92] MNT: Remove self-imposed rate-limit so cached data is loaded immediately --- pangaea_downloader/tools/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index c8a591b..4a34874 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -18,7 +18,8 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0.1667 +T_POLL_INTV = 0 # Allow rapid loading of cached records +# T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s def fetch_child( From f236e0cd84f162e6ba8f5d35a8ff465747a2d89f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:18:10 +0100 Subject: [PATCH 72/92] ENH: Include url_thumbnail column --- pangaea_downloader/merge_benthic_datasets.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 0d3ec6b..1b458cf 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -265,6 +265,7 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. 
desired_columns = { + "url_thumbnail": ["urlthumb", "urlthumbnail"], "dataset": ["ds_id"], "site": ["Event", "event", "deployment"], "image": ["image", "filename"], @@ -329,6 +330,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -342,6 +345,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -355,6 +360,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -1569,6 +1576,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "dataset", "site", "url", + "url_thumbnail", "image", "datetime", "latitude", From 28739119942be735c2a001eb5f9496e71580c632 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:21:43 +0100 Subject: [PATCH 73/92] ENH: Find area columns encoding image area in square meters --- pangaea_downloader/merge_benthic_datasets.py | 21 ++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 1b458cf..76966ee 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd import scipy.interpolate +from pandas.api.types import is_numeric_dtype from pangaeapy import PanDataSet from tqdm.auto import tqdm @@ -235,8 +236,21 @@ def reformat_df(df, remove_duplicate_columns=True): # Make a copy of the dataframe so we can't overwrite the input df = df.copy() - # Remove bad columns - df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") + # Get dataset id from first row + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = ds_id.split("-")[-1] + + # Handle Area column + for col in ["Area", "Area_2", "Area_3"]: + # Area is sometimes the seafloor surface area of the image in + # meters^2 and sometimes used as a synonym for location + if col in df.columns and not all(df[col].isna()) and is_numeric_dtype(df[col]): + print(df.columns) + print(f"{ds_id}: Using {col} for area measurement") + df.rename(columns={col: "area"}, inplace=True, errors="raise") + break + # Remove duplicately named columns cols_to_drop = [] if remove_duplicate_columns: @@ -250,6 +264,8 @@ def reformat_df(df, remove_duplicate_columns=True): ): cols_to_drop.append(col) df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Remove bad columns + df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") # Find the correct URL column, and drop other columns containing "url" cols_to_drop = [] @@ -1581,6 +1597,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "datetime", "latitude", "longitude", + "area", "altitude", "depth_of_observer", "bathymetry", From 6a5c687156db6234154646895263b02843dad6bb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:22:27 +0100 Subject: [PATCH 74/92] MNT: Find and remove additional 
FAVOURITE duplicate images --- pangaea_downloader/merge_benthic_datasets.py | 28 +++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 76966ee..76cd3a1 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -741,8 +741,10 @@ def fixup_favourite_images(df, verbose=1): """ Drop duplicated favourite images. - These occur in Ingo Schewe's datasets along OFOS profiles during POLARSTERN - cruises, PANGAEA dataset ids 849814--849816 and 873995--874002. + These occur in Schewe and Bergmann's datasets along OFOS profiles during + POLARSTERN cruises, PANGAEA dataset ids 849814--849816. 873995--874002, + 895102--895104, 896545--896549, 896653--896657, 912471. + Parameters ---------- @@ -757,14 +759,22 @@ def fixup_favourite_images(df, verbose=1): As input dataframe, but with all Type entries starting with favourite removed (case-insensitive). """ - if "Type" not in df.columns: - return df - # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and - # FAVOURITE_HOTKEY entries, which although they have unique URLs for their - # images are actually identical images to the ones occuring immediately - # after them in the dataframe. n_samples_before = len(df) - df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "Type" in df.columns: + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occuring immediately + # after them in the dataframe. + df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "image" in df.columns: + # Check if the image filename field is repeated except for a leading + # "FAVOURITE_" string, if so remove it. These images are identical + # copies of the other images. 
+ select = df["image"].str.lower().str.startswith("favourite") + image_tmp = df["image"].str.replace("FAVOURITE_", "", case=False, regex=False) + is_repeated = image_tmp.duplicated(False) + # Remove favourite images which are repeated + df = df[~(select & is_repeated)] n_samples_after = len(df) if verbose >= 1 and n_samples_after != n_samples_before: print( From 2808ad9a470571d6b7486d34ed6b50f528dc0d7f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:22:56 +0100 Subject: [PATCH 75/92] MNT: Print files which had duplicated URLs resolved --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 76cd3a1..e3254e5 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1581,9 +1581,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print( f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" ) + for fname in files_with_repeat_urls: + print(f" {fname}") print( f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" ) + for fname in files_with_repeat_urls2: + print(f" {fname}") print() print(f"There are {len(column_count)} unique column names:") print() From aadf712dad1d6da9b15e4286ccd4cead341c75ba Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:50:06 +0100 Subject: [PATCH 76/92] Revert "MNT: Remove self-imposed rate-limit so cached data is loaded immediately" This reverts commit 2082aa31901b4bfbf1fe68a8f8194052c23e66d4. --- pangaea_downloader/tools/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 4a34874..e53c600 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -18,8 +18,8 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0 # Allow rapid loading of cached records -# T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s +# T_POLL_INTV = 0 # Allow rapid loading of cached records +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s def fetch_child( From 4f152f16e7f9821cb9b982b2df140a61743cbbb0 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:51:11 +0100 Subject: [PATCH 77/92] MNT: Print number of records before and after dropping duplicates --- pangaea_downloader/merge_benthic_datasets.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index e3254e5..a1ac91f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1626,10 +1626,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + print(f"There are {len(df_all)} records before dropping duplicated URLs") + # Remove duplicate URLs if verbose >= 1: print("Remove duplicates") df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field if fixup_repeated_output_paths is None: From df0beeabe0e69f1f12051533a3c9dd74bb2f7537 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: 
Wed, 5 Apr 2023 10:52:46 +0100 Subject: [PATCH 78/92] MNT: Print IDs of datasets which may have label columns --- pangaea_downloader/merge_benthic_datasets.py | 32 ++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index a1ac91f..e45ea51 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1504,6 +1504,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): n_valid = 0 dfs = [] dfs_fnames = [] + ids_with_potential_labels = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 if not fname.endswith(".csv"): @@ -1548,6 +1549,30 @@ def process_datasets(input_dirname, output_path=None, verbose=0): column_count[col] += 1 column_examples[col].append(fname) + for key in [ + # "Type", + "Content", # Yes! + # "Sample label", + # "ID", + # "Sample ID", + "Classification", # Yes! + "Species", # Yes! + # "Reference", + # "Samp type", + "Family", + "Genus", + # "Ind No", + # "Imagery", + # "Img brightness", # No + "Ground vis", # Yes! + "Marine litter", + "Fisheries plastic", + "Unident litter", + ]: + if key in df.columns: + print(f"{fname} has {key}") + ids_with_potential_labels.append(ds_id) + # Drop rows that are complete duplicates df.drop_duplicates(inplace=True) @@ -1598,6 +1623,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): c = col + " " print(f"{c:.<35s} {count:4d}") print() + ids_with_potential_labels = sorted(set(ids_with_potential_labels)) + print( + f"There are {len(ids_with_potential_labels)} datasets which might have labels to extract:" + ) + for ds_id in ids_with_potential_labels: + print(ds_id) + print() if verbose >= 1: print("Filter columns") From f9d5b48211c5f64f70aded629e2ebcb9ba4f3d9b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:53:03 +0100 Subject: [PATCH 79/92] MNT: Rename parent_ds_id -> collection --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index e45ea51..f000409 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1657,6 +1657,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + df_all.rename(columns={"parent_ds_id": "collection"}, inplace=True) print(f"There are {len(df_all)} records before dropping duplicated URLs") From 45a3492b9eaafd972d7bd2a0c698b4794e58c7f3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:50:32 +0100 Subject: [PATCH 80/92] BUG: Fix nanosecond output format of datetime in pangaea-907025 --- pangaea_downloader/merge_benthic_datasets.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f000409..c348057 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1366,12 +1366,17 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): select_and_col = select & ~missing_dt select_not_col = select & missing_dt if any(select_and_col) and any(select_not_col): - df.loc[select_not_col, col] = scipy.interpolate.interp1d( + new_values = scipy.interpolate.interp1d( indices[select_and_col], - 
pd.to_datetime(df.loc[select_and_col, col]), + pd.to_datetime(df.loc[select_and_col, col]).map( + pd.Timestamp.timestamp + ), kind="nearest", fill_value="extrapolate", )(indices[select_not_col]) + new_values = pd.to_datetime(new_values, unit="s") + new_values = new_values.strftime("%Y-%m-%d") + df.loc[select_not_col, col] = new_values if ds_id in [911904, 918924, 919348]: if verbose >= 1: From 13080a1ad7c2604473bd5f56879ff40c085a7734 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:51:03 +0100 Subject: [PATCH 81/92] MNT: Convert parent_ds_id into pangaea-IDENTIFIER like ds_id --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index c348057..6403be0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1534,6 +1534,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) else: df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) df = reformat_df(df) if df is None: From 1db2a4a9575fc0a6499c5d848e6dae80ba3437e9 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:51:29 +0100 Subject: [PATCH 82/92] DOC: Fix rate limit comment --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e53c600..8b6f83a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,7 +19,7 @@ T_POLL_LAST = 0 # T_POLL_INTV = 0 # Allow rapid loading of cached records -T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 180 requests within 30s def fetch_child( From b1b2a7095f1497685043fbd4606a8f6c3ed92b61 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:52:13 +0100 Subject: [PATCH 83/92] ENH: Merge down metadata across rows with repeated URLs, preserving details --- pangaea_downloader/merge_benthic_datasets.py | 83 +++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6403be0..2df21bc 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -29,6 +29,10 @@ fixup_repeated_output_paths = None row2basename = None +# Create new `pandas` methods which use `tqdm` progress +# (can use tqdm_gui, optional kwargs, etc.) +tqdm.pandas() + TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], ["Phylum", "Division"], @@ -1480,6 +1484,81 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): return df +def merge_duplicated_urls(df): + """ + Merge metadata across rows which have the same URL. + """ + print("Original number of rows:", len(df)) + df.drop_duplicates(inplace=True) + print("Number of rows after dropping simple duplicates:", len(df)) + # Record the original sort index so we can get the data back in the original + # order. + df["original_index"] = df.index + # Determine how many images are at the same location. This indicates how + # accurate the latitude and longitude information is. We will want to keep + # the most accurate version of this. 
+ repeat_location_counts = df[["longitude", "latitude"]].value_counts() + repeat_location_counts = repeat_location_counts.to_frame() + repeat_location_counts.rename(columns={0: "tally_repeated_location"}, inplace=True) + # Add the tally_repeated_location data as a new column + df = df.merge(repeat_location_counts, how="left", on=["latitude", "longitude"]) + + def resolve_duplicates(sdf): + if len(sdf) == 1: + # If there's only one row in the group, return it. + return sdf.iloc[0] + # Take the entry which has the fewest repetitions of the latitude and + # longitude value. We will use the version from the first dataset that + # had the fewest repetitions of the location for this image. + # We adopt this row's collection, dataset, and site values in addition + # to its coordinates. + idx = np.argmin(sdf["tally_repeated_location"]) + row = sdf.iloc[idx].copy() + # For numeric columns (other than latitude and longitude), take the + # average of the values where they are present. + for col in [ + "depth_of_observer", + "altitude", + "bathymetry", + "salinity", + "temperature", + "acidity", + "area", + ]: + select = ~pd.isna(sdf[col]) + if select.sum() == 0: + continue + row[col] = sdf[select][col].mean() + # Look to see if we are missing an image or thumbnail entry and one + # of the duplicates has its value. + for col in ["image", "url_thumbnail"]: + if not pd.isna(row[col]): + continue + values = sdf[col] + values = values[~pd.isna(values)] + if len(values) == 0: + continue + row[col] = values.iloc[0] + # For datetime, use the fact that we encoded datetime as a string + # with varying levels of precision. More digits means higher precision. + # Take the most precise value, preferring the value from the selected + # record in the event of a tie. + datetime_len = sdf["datetime"].str.replace(" 00:00:00", "").str.len() + idx_dt = np.argmax(datetime_len) + if datetime_len.iloc[idx] != datetime_len.iloc[idx_dt]: + row["datetime"] = sdf.iloc[idx_dt]["datetime"] + return row + + print("Merging metadata between rows with the same URL") + # Group by URL and apply our transformation to each group + df_out = df.groupby("url").progress_apply(resolve_duplicates) + # Reorder the dataframe to preseve implicit temporal information from the + # ordering of the images + df_out.sort_values("original_index", inplace=True) + df_out.drop(columns=["original_index", "tally_repeated_location"], inplace=True) + return df_out + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. 
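# Illustrative sketch with toy data: tqdm.pandas() (called at module import
# above) registers .progress_apply() on pandas objects, which is what lets the
# per-URL merge in merge_duplicated_urls report a progress bar while it runs.
# A minimal, self-contained example of the same pattern (the toy frame and its
# values are made up for illustration):
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()  # adds .progress_apply() to DataFrame/Series/GroupBy objects
toy = pd.DataFrame({"url": ["a", "a", "b"], "depth": [10.0, 20.0, 5.0]})
# Average the numeric metadata within each URL group, with a progress bar
merged = toy.groupby("url").progress_apply(lambda g: g.mean(numeric_only=True))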
@@ -1670,8 +1749,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): # Remove duplicate URLs if verbose >= 1: - print("Remove duplicates") - df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print("Merge duplicated URLs") + df_all = merge_duplicated_urls(df_all) print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field From f631dd85fba8e3b9413cac81a7593629266da3d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 10:23:49 +0100 Subject: [PATCH 84/92] MNT: Save a copy with duplicates before removing them, so duplicates can be resolved without merging again --- pangaea_downloader/merge_benthic_datasets.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2df21bc..d5e7336 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1747,6 +1747,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print(f"There are {len(df_all)} records before dropping duplicated URLs") + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + output_path_with_dups = os.path.splitext(output_path)[0] + "_with-duplicates.csv" + if verbose >= 0: + print(f"Saving (with duplicates) to {output_path_with_dups}") + df_all.to_csv(output_path_with_dups, index=False) + # Remove duplicate URLs if verbose >= 1: print("Merge duplicated URLs") @@ -1762,10 +1769,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print("Fix repeated output paths to prevent collisions") df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) - if os.path.dirname(output_path): - os.makedirs(os.path.dirname(output_path), exist_ok=True) if verbose >= 0: - print(f"Saving to {output_path}") + print(f"Saving (without duplicates) to {output_path}") df_all.to_csv(output_path, index=False) From eebb878005d0e2ae00d8d93c5596148afc35005e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 10:25:35 +0100 Subject: [PATCH 85/92] BUG: Need to convert datetime to string before merging (some are datetime objects) --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index d5e7336..2ccc433 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1757,6 +1757,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): # Remove duplicate URLs if verbose >= 1: print("Merge duplicated URLs") + # Convert datetime to string + df_all["datetime"] = df_all["datetime"].astype(str) df_all = merge_duplicated_urls(df_all) print(f"There are {len(df_all)} records after dropping duplicated URLs") From f1cf9852c9feaf686f8c2b61b04ada00efdd70b3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:25:25 +0100 Subject: [PATCH 86/92] MNT: Rewrite any(list comp) as any(generator) instead (flake8:C419) C419 Unnecessary list comprehension passed to any() prevents short-circuiting - rewrite as a generator. 
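For example, in reformat_df the taxonomy check

    any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]])

builds the full three-element list before any() sees it, whereas the generator form

    any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"])

can return True as soon as the first rank name is found, without evaluating the remaining membership tests.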
--- pangaea_downloader/merge_benthic_datasets.py | 2 +- pangaea_downloader/tools/checker.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2ccc433..2cfd4c3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -425,7 +425,7 @@ def reformat_df(df, remove_duplicate_columns=True): if "site" not in df.columns: df["site"] = df["dataset"] + "_site" - if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): + if any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( labels=[x for syn in TAXONOMY_RANKS for x in syn], diff --git a/pangaea_downloader/tools/checker.py b/pangaea_downloader/tools/checker.py index 7292c43..3d345d6 100644 --- a/pangaea_downloader/tools/checker.py +++ b/pangaea_downloader/tools/checker.py @@ -61,8 +61,8 @@ def is_invalid_file_ext(filename: str) -> bool: # --------------------------------------------- DataFrame Checkers --------------------------------------------- # def has_url_col(df: DataFrame) -> bool: """Take a Pandas DataFrame and return True if it has image URL column.""" - condition1 = any(["url" in col.lower() for col in df.columns]) - condition2 = any(["image" in col.lower() for col in df.columns]) + condition1 = any("url" in col.lower() for col in df.columns) + condition2 = any("image" in col.lower() for col in df.columns) return condition1 or condition2 From 4a42f493f752ca3aa75f7d13b18076b74651a0d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:26:46 +0100 Subject: [PATCH 87/92] MNT: Rename depth columns --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2cfd4c3..9898115 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -319,8 +319,8 @@ def reformat_df(df, remove_duplicate_columns=True): "x_pos": [], "y_pos": [], "altitude": ["altitude", "heightaboveseafloor", "height"], - "depth_of_observer": ["depthwater", "depth"], - "bathymetry": ["bathydepth", "bathymetry", "bathy"], + "depth_camera": ["depthwater", "depth"], + "depth_seafloor": ["bathydepth", "bathymetry", "bathy"], "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], From e5d9db079226989d52bdfd0a8cf1ca64ae1e37d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:27:21 +0100 Subject: [PATCH 88/92] MNT: Exclude AntGlassSponges with DOWN in their URL - not Benthic imagery --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 9898115..4641f4b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -538,6 +538,7 @@ def check_image_url(url): if ( url.startswith("https://hs.pangaea.de/Images/Benthos/AntGlassSponges/") and "AHEAD" not in url + and "DOWN" not in url ): # Images of AntGlassSponges must contain "AHEAD" to be kept # otherwise, they are of sponges after removal From 3dfaba1c1fa74cb0960d3c40b20534a8e729b128 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:27:52 +0100 Subject: [PATCH 89/92] MNT: Skip missing URL cols --- 
pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 4641f4b..7079aaf 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1526,6 +1526,8 @@ def resolve_duplicates(sdf): "acidity", "area", ]: + if col not in sdf.columns: + continue select = ~pd.isna(sdf[col]) if select.sum() == 0: continue @@ -1533,6 +1535,8 @@ def resolve_duplicates(sdf): # Look to see if we are missing an image or thumbnail entry and one # of the duplicates has its value. for col in ["image", "url_thumbnail"]: + if col not in sdf.columns: + continue if not pd.isna(row[col]): continue values = sdf[col] From a8967dcdaef95cc991b860eef200a0eb6e2cf67c Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:28:23 +0100 Subject: [PATCH 90/92] ENH: Add process_single to cleanup metadata for a single dataset --- pangaea_downloader/merge_benthic_datasets.py | 75 ++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 7079aaf..a7532fa 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1564,6 +1564,81 @@ def resolve_duplicates(sdf): return df_out +def process_single(df, ds_id=None, verbose=1, remove_duplicate_columns=False): + """ + Reformat and cleanup metadata for a single dataset. + + Parameters + ---------- + df : pandas.Dataframe + The dataset to process. + ds_id : int, optional + The ID number for the PANGAEA dataset. If omitted, it is inferred from + the ``ds_id`` column of ``df``. + verbose : int, default=1 + Verbosity level. + remove_duplicate_columns : bool, default=False + Whether to remove duplicate column names. + + Returns + ------- + df : pandas.Dataframe + A processed copy of the dataset. 
+ """ + if df is None or len(df) == 0: + return df + + if ds_id is None: + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = int(ds_id.split("-")[-1]) + + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + df["ds_id"] = df["ds_id"].str.replace("pangaea-pangaea-", "pangaea-") + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df["parent_ds_id"] = df["parent_ds_id"].str.replace( + "pangaea-pangaea-", "pangaea-" + ) + + df = reformat_df(df, remove_duplicate_columns=remove_duplicate_columns) + if df is None: + return df + + url_col = "url" + df = df[df[url_col] != ""] + if len(df) == 0: + return df + + df = filter_urls(df, url_column=url_col) + if len(df) == 0: + return df + + # Drop rows that are complete duplicates + df.drop_duplicates(inplace=True) + + # Try to fix repeated URLs that are accidental dups but should differ + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) + + # Check for any rows that are all NaNs + if sum(df.isna().all("columns")) > 0: + print(f"{ds_id} has a row which is all NaNs") + + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. From c4ce7aba35b6eae8c9f63e46f04b316b9d44abf2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:29:17 +0100 Subject: [PATCH 91/92] JNB: More EDA and new output files --- notebooks/explore-depth-columns.ipynb | 80 ++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 2f3c590..53305b9 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -30,13 +30,19 @@ "cell_type": "code", "execution_count": null, "id": "b6f9ebdb", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Load datasets from this directory\n", "dirname = \"../query-outputs_2022-01-01\"\n", + "dirname = \"../query-outputs_2023-03-07_extras/\"\n", + "dirname = \"../query-outputs_2023-03-30c/\"\n", + "# dirname = \"../query-outputs_2023-03-30c\"\n", "# Pangaea benthic image dataset file with filtered dataset IDs\n", "pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", + "pangaea_file = \"../datasetcsvs/pangaea_2023-03-30c_with-tiles4.csv\"\n", "pangaea_df = pd.read_csv(pangaea_file)\n", "ds_ids = pangaea_df.dataset.unique()\n", "print(f\"Total {len(ds_ids)} datasets to process.\")" @@ -186,6 +192,78 @@ " print(f\"{c:.<35s} {count:4d}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a251b7dd-673b-43c0-b948-bb83019aedb1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"sal\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42a79516-2ab2-45ee-b876-daf12758ed00", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"area\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235276c3-d887-46b6-a453-2873a636533a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ 
+ "column_examples[\"length\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "897336e0-d260-46d4-a71b-7e882e785ce5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"classification\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f159be9-f6dc-4d0f-ae6a-a781a9983cdf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"content\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d804d2f-6adb-42f3-b164-68fe42a08b92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"ground vis\"]" + ] + }, { "cell_type": "markdown", "id": "a07b478a-bd3d-417f-8e88-f49ea585c812", From d8e0aef41356b7e99502c85d3fb83ddbe8cc453c Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:45:27 +0100 Subject: [PATCH 92/92] DEV: Remove malfunctioning pretty-format-json On the GitHub workflow, it is trying to fix the format of the jupyter notebooks. --- .pre-commit-config.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19b80c0..52656ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -80,8 +80,6 @@ repos: - id: detect-private-key - id: end-of-file-fixer exclude: ^LICENSE|\.(html|csv|txt|svg|py)$ - - id: pretty-format-json - args: ["--autofix", "--no-ensure-ascii", "--no-sort-keys"] - id: requirements-txt-fixer - id: trailing-whitespace args: [--markdown-linebreak-ext=md]