diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19b80c0..52656ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -80,8 +80,6 @@ repos: - id: detect-private-key - id: end-of-file-fixer exclude: ^LICENSE|\.(html|csv|txt|svg|py)$ - - id: pretty-format-json - args: ["--autofix", "--no-ensure-ascii", "--no-sort-keys"] - id: requirements-txt-fixer - id: trailing-whitespace args: [--markdown-linebreak-ext=md] diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index b84d525..53305b9 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -18,7 +18,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from benthicnet.utils import sanitize_filename, sanitize_filename_series\n", + "from benthicnet.io import sanitize_filename, sanitize_filename_series\n", "from IPython.display import display\n", "from tqdm.auto import tqdm\n", "\n", @@ -30,13 +30,19 @@ "cell_type": "code", "execution_count": null, "id": "b6f9ebdb", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Load datasets from this directory\n", "dirname = \"../query-outputs_2022-01-01\"\n", + "dirname = \"../query-outputs_2023-03-07_extras/\"\n", + "dirname = \"../query-outputs_2023-03-30c/\"\n", + "# dirname = \"../query-outputs_2023-03-30c\"\n", "# Pangaea benthic image dataset file with filtered dataset IDs\n", "pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", + "pangaea_file = \"../datasetcsvs/pangaea_2023-03-30c_with-tiles4.csv\"\n", "pangaea_df = pd.read_csv(pangaea_file)\n", "ds_ids = pangaea_df.dataset.unique()\n", "print(f\"Total {len(ds_ids)} datasets to process.\")" @@ -186,6 +192,78 @@ " print(f\"{c:.<35s} {count:4d}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a251b7dd-673b-43c0-b948-bb83019aedb1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"sal\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42a79516-2ab2-45ee-b876-daf12758ed00", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"area\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235276c3-d887-46b6-a453-2873a636533a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"length\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "897336e0-d260-46d4-a71b-7e882e785ce5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"classification\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f159be9-f6dc-4d0f-ae6a-a781a9983cdf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"content\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d804d2f-6adb-42f3-b164-68fe42a08b92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"ground vis\"]" + ] + }, { "cell_type": "markdown", "id": "a07b478a-bd3d-417f-8e88-f49ea585c812", @@ -249,7 +327,7 @@ "\n", "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -265,14 +343,21 @@ " f\"[{i}] Mean: {mean:.2f} ± 
{sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", " )\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", " # print(\"\\tMin or Max non-positive.\")\n", " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", - " print(\"\\tStart or Ene near zero.\")\n", + " print(\"\\tStart or End near zero.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if min_ < 0:\n", + " print(\"\\tNegative depth.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, @@ -307,8 +392,9 @@ "# Column to find\n", "key = \"bathy depth\"\n", "\n", + "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -320,8 +406,12 @@ " # Show\n", " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", " val_exception[url] = (mean, sd, min_, max_)" @@ -361,7 +451,7 @@ "# Depth bot & depth top\n", "\n", "for i, file in enumerate(column_examples[keys[0]]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", @@ -380,7 +470,76 @@ " plt.plot(df[key], label=key)\n", " plt.plot(abs(df[\"depth top\"] - df[\"depth bot\"]), label=\"diff\", linestyle=\":\")\n", " plt.legend()\n", - " plt.show()" + " plt.title(url.split(\"/\")[-1])\n", + " plt.show()\n", + " print(url)" + ] + }, + { + "cell_type": "markdown", + "id": "0ee401a9-e936-4d8b-915d-ed3b1303fd65", + "metadata": {}, + "source": [ + "### 2.4 Elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1ae559-e6ee-47b8-8f20-69bcef238cb5", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Column to find\n", + "key = \"elevation\"\n", + "\n", + "val_exception = {}\n", + "for i, file in enumerate(column_examples[key]):\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " url_column = find_url_column(df)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Check for start and end at 0 altitude/depth\n", + " start, end = df[key].iloc[0], df[key].iloc[-1]\n", + " # Show\n", + " print(\n", + " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", + " )\n", + " plt.figure(figsize=(16, 4))\n", + " 
plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.show()\n", + " print(url)\n", + " # Datasets that defy column value norms\n", + " # if (min_ <= 0) or (max_ <= 0):\n", + " # print(\"\\tMin or Max non-positive.\")\n", + " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if max_ > 0:\n", + " print(\"\\tPositive elevation.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb64c1f-b5b3-4d8c-9943-b9c6810a1d53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_exception" ] }, { @@ -402,7 +561,7 @@ "print(len(column_examples[\"depth water\"]))\n", "print(len(column_examples[\"bathy depth\"]))\n", "print(len(column_examples[\"bathy depth_2\"]))\n", - "print(len(column_examples[\"bathy_depth\"]))" + "print(len(column_examples[\"elevation\"]))" ] }, { @@ -458,7 +617,7 @@ "keys = [\"depth\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -513,7 +672,7 @@ "keys = [\"depth water\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -600,13 +759,113 @@ "**NOTE:** Upon checking the dataset webpages we see that the two bathy depth columns correspond to the original collection and recollection sites." 
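The comparison cells above overlay pairs of depth-type columns from the same file to see whether they track one another. A minimal numeric version of the same check — assuming `df` is one of the loaded dataset files and using the column names as they appear in the notebook:

```python
# How closely do two depth-type columns agree within one file?
pair = df[["depth water", "bathy depth"]].dropna()
gap = (pair["depth water"] - pair["bathy depth"]).abs()
print(f"rows compared: {len(pair)}, median |difference|: {gap.median():.2f} m")
```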
] }, + { + "cell_type": "markdown", + "id": "61d6de0f-09d8-43f5-a2b6-c47afed77a9d", + "metadata": {}, + "source": [ + "## 3.5 Datasets with depth water and elevation" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6aabda82", + "id": "c3627d1c-717d-4dc2-b20d-761adebd513d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column1 = \"depth water\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eeae8d64-0038-47ea-bc0b-8e59e0724b5e", "metadata": {}, + "source": [ + "## 3.6 Datasets with bathy depth and elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eccfc931-307a-4007-8306-6ea918a1489b", + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "column1 = \"bathy depth\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] } ], "metadata": { diff --git a/pangaea_downloader/citations.py b/pangaea_downloader/citations.py index db6796f..833463c 100644 --- a/pangaea_downloader/citations.py +++ b/pangaea_downloader/citations.py @@ -1,13 +1,14 @@ import pickle import pandas as pd -import requests + +from .tools import requesting def get_bibtex(ds_id: str, verbose=False) -> str: """Get the BibTex Citation of a Pangaea dataset using the dataset ID.""" bib_url = 
f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}?format=citation_bibtex" - resp = requests.get(bib_url) + resp = requesting.get_request_with_backoff(bib_url) if verbose: print("\tStatus code:", resp.status_code) return resp.text diff --git a/pangaea_downloader/licenses.py b/pangaea_downloader/licenses.py index 4ba9a38..8cfac90 100644 --- a/pangaea_downloader/licenses.py +++ b/pangaea_downloader/licenses.py @@ -5,10 +5,11 @@ from typing import Dict, Optional, Union import pandas as pd -import requests from bs4 import BeautifulSoup from tqdm import tqdm +from .tools import requesting + def get_dataset_url(ds_id: Union[str, int]) -> str: """Return dataset URL given the six digit dataset ID.""" @@ -18,7 +19,7 @@ def get_dataset_url(ds_id: Union[str, int]) -> str: def get_dataset_license_info(url: str) -> Optional[Dict[str, str]]: """Return a dictionary with license information given the dataset URL.""" # Make a request to the URL and parse the html - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") # Get the tag containing the license info license_tag = soup.find("a", attrs={"rel": "license"}) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index eb15e1a..a7532fa 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -10,21 +10,28 @@ import os import re from collections import defaultdict +from functools import partial -import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd -from IPython.display import display +import scipy.interpolate +from pandas.api.types import is_numeric_dtype +from pangaeapy import PanDataSet from tqdm.auto import tqdm from pangaea_downloader import __meta__ from pangaea_downloader.tools import checker try: - from benthicnet.io import fixup_repeated_output_paths + from benthicnet.io import fixup_repeated_output_paths, row2basename except ImportError: fixup_repeated_output_paths = None + row2basename = None + +# Create new `pandas` methods which use `tqdm` progress +# (can use tqdm_gui, optional kwargs, etc.) 
+tqdm.pandas() TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], @@ -196,6 +203,13 @@ def check_title(title): return False if title.startswith("Images of shell cross sections"): return False + if ( + "early biofouling processes in a coastal lagoon" in title.lower() + or "early biofouling processes in a coastal la goon" in title.lower() + ): + return False + if "photographs of tiles" in title.lower(): + pass return True @@ -226,8 +240,21 @@ def reformat_df(df, remove_duplicate_columns=True): # Make a copy of the dataframe so we can't overwrite the input df = df.copy() - # Remove bad columns - df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") + # Get dataset id from first row + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = ds_id.split("-")[-1] + + # Handle Area column + for col in ["Area", "Area_2", "Area_3"]: + # Area is sometimes the seafloor surface area of the image in + # meters^2 and sometimes used as a synonym for location + if col in df.columns and not all(df[col].isna()) and is_numeric_dtype(df[col]): + print(df.columns) + print(f"{ds_id}: Using {col} for area measurement") + df.rename(columns={col: "area"}, inplace=True, errors="raise") + break + # Remove duplicately named columns cols_to_drop = [] if remove_duplicate_columns: @@ -241,6 +268,8 @@ def reformat_df(df, remove_duplicate_columns=True): ): cols_to_drop.append(col) df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Remove bad columns + df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") # Find the correct URL column, and drop other columns containing "url" cols_to_drop = [] @@ -256,8 +285,9 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. 
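The `desired_columns` mapping that follows is resolved first-match-wins: for each canonical output name, the highest-priority search name present among the cleaned column names is renamed to it and any lower-priority matches are dropped. A simplified standalone sketch of that rule (the `resolve_columns` helper is hypothetical, not the exact loop used in `reformat_df`):

```python
def resolve_columns(columns, desired):
    """Return (rename_mapping, columns_to_drop) using first-match-wins priority."""
    lower = [c.lower() for c in columns]
    mapping, to_drop = {}, []
    for canon, candidates in desired.items():
        hits = [columns[lower.index(c)] for c in candidates if c in lower]
        if hits:
            mapping[hits[0]] = canon  # highest-priority match is kept
            to_drop.extend(hits[1:])  # lower-priority duplicates are discarded
    return mapping, to_drop

# resolve_columns(["Latitude", "Lat"], {"latitude": ["latitude", "lat"]})
# -> ({"Latitude": "latitude"}, ["Lat"])
```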
desired_columns = { - "dataset": ["ds_id", "dataset", "Campaign", "campaign"], - "site": ["Event", "event", "Site", "site", "deployment"], + "url_thumbnail": ["urlthumb", "urlthumbnail"], + "dataset": ["ds_id"], + "site": ["Event", "event", "deployment"], "image": ["image", "filename"], "datetime": [ "Date/Time", @@ -274,7 +304,7 @@ def reformat_df(df, remove_duplicate_columns=True): "latitude+", "latitudemed", "latitudenorth", - "latitudesouth", + # "latitudesouth", # special handling ], "longitude": [ "Longitude", @@ -283,20 +313,15 @@ def reformat_df(df, remove_duplicate_columns=True): "long", "longitude+", "longitudemed", - "longitudewest", "longitudeeast", + # "longitudewest", # special handling ], "x_pos": [], "y_pos": [], - "altitude": ["altitude", "height"], - "depth": [ - "depthwater", - "bathydepth", - "bathymetry", - "bathy", - "depth", - "elevation", - ], + "altitude": ["altitude", "heightaboveseafloor", "height"], + "depth_camera": ["depthwater", "depth"], + "depth_seafloor": ["bathydepth", "bathymetry", "bathy"], + "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], "salinity": ["salinity", "sal"], @@ -325,6 +350,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -338,6 +365,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -351,6 +380,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -358,15 +389,43 @@ def reformat_df(df, remove_duplicate_columns=True): # Remove superfluous columns df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") + # Handle latitudesouth and longitudewest + if "latitude" not in df.columns and "latitudesouth" in lower_cols: + col = df.columns[lower_cols.index("latitudesouth")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "latitude" not in df.columns and "latitude-" in lower_cols: + col = df.columns[lower_cols.index("latitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "longitude" not in df.columns and "longitudewest" in lower_cols: + col = df.columns[lower_cols.index("longitudewest")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + if "longitude" not in df.columns and "longitude-" in lower_cols: + col = df.columns[lower_cols.index("longitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + + # Remove datapoints with erroneous negative depth + if "depth_of_observer" in df.columns: + # Only observed two datapoints where this happens + df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: # df["timestamp"] = 
df["datetime"].apply(datetime2timestamp) - if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): + # Add default site if it is missing + if "site" not in df.columns: + df["site"] = df["dataset"] + "_site" + + if any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( labels=[x for syn in TAXONOMY_RANKS for x in syn], @@ -479,6 +538,7 @@ def check_image_url(url): if ( url.startswith("https://hs.pangaea.de/Images/Benthos/AntGlassSponges/") and "AHEAD" not in url + and "DOWN" not in url ): # Images of AntGlassSponges must contain "AHEAD" to be kept # otherwise, they are of sponges after removal @@ -682,6 +742,903 @@ def fixup_repeated_urls( return df +def fixup_favourite_images(df, verbose=1): + """ + Drop duplicated favourite images. + + These occur in Schewe and Bergmann's datasets along OFOS profiles during + POLARSTERN cruises, PANGAEA dataset ids 849814--849816. 873995--874002, + 895102--895104, 896545--896549, 896653--896657, 912471. + + + Parameters + ---------- + df : pandas.DataFrame + A PANGAEA dataframe with Type column. + verbose : int, default=1 + Level of verbosity. + + Returns + ------- + df : pandas.DataFrame + As input dataframe, but with all Type entries starting with favourite + removed (case-insensitive). + """ + n_samples_before = len(df) + if "Type" in df.columns: + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occuring immediately + # after them in the dataframe. + df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "image" in df.columns: + # Check if the image filename field is repeated except for a leading + # "FAVOURITE_" string, if so remove it. These images are identical + # copies of the other images. + select = df["image"].str.lower().str.startswith("favourite") + image_tmp = df["image"].str.replace("FAVOURITE_", "", case=False, regex=False) + is_repeated = image_tmp.duplicated(False) + # Remove favourite images which are repeated + df = df[~(select & is_repeated)] + n_samples_after = len(df) + if verbose >= 1 and n_samples_after != n_samples_before: + print( + f"{df.iloc[0]['dataset']}:" + f" Removed {n_samples_before - n_samples_after} favourited images." + f" {n_samples_before} -> {n_samples_after} rows" + ) + return df + + +def get_dataset_datetime(ds_id): + """ + Determine a generic date for a dataset from the min and max extent datetimes. + + Parameters + ---------- + ds_id : int + The identifier of a PANGAEA dataset. + + Returns + ------- + dt_avg : str + The average datetime between the min and max extent, with precision + reduced to reflect what can accurately be represented. 
+ """ + ds = PanDataSet(ds_id, enable_cache=True) + dt_min = pd.to_datetime(ds.mintimeextent) + dt_max = pd.to_datetime(ds.maxtimeextent) + if dt_min is None and dt_max is None: + return pd.NaT + elif dt_min is None: + return dt_max.strftime("%Y-%m-%d") + elif dt_max is None: + return dt_min.strftime("%Y-%m-%d") + delta = dt_max - dt_min + dt_avg = dt_min + delta / 2 + if delta > datetime.timedelta(days=90): + return dt_avg.strftime("%Y") + if delta > datetime.timedelta(days=4): + return dt_avg.strftime("%Y-%m") + if delta > datetime.timedelta(hours=3): + return dt_avg.strftime("%Y-%m-%d") + if delta > datetime.timedelta(minutes=5): + return dt_avg.strftime("%Y-%m-%d %H:00:00") + if delta > datetime.timedelta(seconds=5): + return dt_avg.strftime("%Y-%m-%d %H:%M:00") + return dt_avg.strftime("%Y-%m-%d %H:%M:%S") + + +def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): + """ + Extract datetime information from the contents of the image column in the dataframe. + + Note that the extraction operation is only performed on dataset IDs for + which the image naming scheme has been manually evaluated, and is not + applied blindly to datasets which have not been inspected. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int + The identifier of the PANGAEA dataset. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells filled in from the image. + Existing datetime values are unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + ds_id = int(ds_id) + + select = df["datetime"].isna() + + if row2basename is None: + selected_image = df.loc[select, "image"] + else: + selected_image = df[select].apply( + partial(row2basename, use_url_extension=True), axis=1 + ) + + selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) + + if 371062 <= ds_id <= 371064: + # e.g. PO309_41-1_2004-04-05T08_55_41.jpg + # e.g. PO309_41-2-1_2004-04-05T11_28_26.jpg + # e.g. PO322_211-4-1_2005-05-18T19_35_31.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:])) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%dT%H-%M-%S") + + elif ds_id in [ + 785104, + 785105, + 785106, + 785107, + 785108, + 785109, + 785110, + 836457, + 867771, + 867772, + 867773, + 867774, + 867775, + 867776, + 867777, + 867778, + 867806, + 867807, + 867808, + 867852, + 867853, + 867861, + 873541, + 875713, + 875714, + 876422, + 876423, + 876511, + 876512, + 876513, + 876514, + 876515, + 876516, + 876517, + 876518, + 880043, + 880044, + 885666, + 885667, + 885668, + 885669, + 885670, + 885672, + 885674, + 885675, + 885709, + 885712, + 885713, + 885714, + 885715, + 885716, + 885717, + 885718, + 885719, + 885720, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. PP_107-100_2012-03-19.png + # e.g. PP_100_2012-06-05a.jpg + # e.g. TH_122_2012-03-27.jpg + # e.g. J_05_2017_05_24a.jpg + # e.g. J_overview_2017-05-24za.jpg + # e.g. J_40_2017_08_11a.jpg + # e.g. J_05_2017-08-11a.jpg + # e.g. LG_OVERVIEW_01_05_06_07_09_2013_02_24a.jpg + # e.g. LG_01_07_2010_11_11a.jpg + # e.g. LG_01_2010_11_11a.jpg + # e.g. LG_Cluster1_2012_01_31a.jpg + # e.g. LG_01_07_2012_04_22a.jpg + # e.g. LG_SCREW_2012_04_22a.jpg + # e.g. So_01_2014_02_15b.jpg + # e.g. XH_01_2013_01_12_a.jpg + # e.g. XH_01%2B09_2013_11_19_a.jpg + # e.g. XH_01_2010_04_22_a.jpg + # e.g. LH_020_2015_01_28a_counted.jpg + # e.g. 
LH_020_2015_01_28xx.jpg + # e.g. J_J40%2BJ46%2BJ41_2016_09_25_a.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-11:].str.replace("_", "-").str.lstrip("-") + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [ + 789211, + 789212, + 789213, + 789214, + 789215, + 789216, + 789219, + 819234, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2003_W01-2.jpg + # e.g. 2004_B_bewachsen.jpg + # e.g. 2005_B.jpg + # e.g. 2013_B01-1.jpg + dtstr = selected_image_no_ext.str[:4] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [ + 789217, + 793210, + 793211, + 818906, + 818907, + 836263, + 836264, + 836265, + 836266, + 837653, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 04_2011.jpg + # e.g. 04a_2011_analog.jpg + # e.g. 04.2-2008.jpg + # e.g. 08-2008.jpg + # e.g. 04a_2013.jpg + # e.g. 05a_2003.jpg + # e.g. 04_2007.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-4:] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [836024, 836025]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 00setting_2014-08.jpg + # e.g. 39.9_2014.jpg + # e.g. 2014_B01-1.jpg + df.loc[select, "datetime"] = "2014" + + elif ds_id in [840699, 840700, 840702, 840703, 840742, 840743]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_001_2012-01-31.jpg + # e.g. J_003_2012-01-31_2.jpg + # e.g. J_115_2012-01-31_a.jpg + # e.g. J_033_2012-08-08.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + dtstr = dtstr.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [840701, 849298]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_002_2013-03_03a.jpg + # e.g. J_001_2015-01.jpg + # e.g. J_001_2015-01_a.jpg + # e.g. J_056_2013-03_06logger.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y-%m") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [872407, 872408, 872409, 872410, 872411]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_40_2017-01-12_a.jpg + # e.g. J_overview2_2017-02-02_x.jpg + # e.g. J_xx_2017-01-12_x-62.jpg + # e.g. J_17_2017-01-14.jpg + # e.g. J_23_2017-01-14_b-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [878045, 888410]: + # Nothing to do + pass + + elif ds_id in [894734]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. HOTKEY_2018_03_27at21_09_21CP4A4682 + # e.g. 
TIMER_2018_03_18at04_04_09CP4A3970 + dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") + + elif ds_id in [896157, 896160, 896164]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2016-08-2600000.jpg + dtstr = selected_image_no_ext.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + if ds_id in [ + 918232, + 918233, + 918327, + 918340, + 918341, + 918382, + 918383, + 918385, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. XH_01_2010_04_22_a.jpg + # e.g. XH_01_2010_04_28a.jpg + # e.g. XH_03_2018_10_18_a-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:5])[:10]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + return df + + +def add_missing_datetime(df, ds_id=None, verbose=1): + """ + Add missing datetime values using either the mean extent or extraction from the file name. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells completed, either by using the + average from the datetime extent metadata, or by extracting it from the + image name. + All existing datetime values are left unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + # Add datetimes that are still missing by inferring from the image filename + df = fix_missing_datetime_from_image_name(df, ds_id, verbose=verbose) + + if all(df["datetime"].isna()): + # This dataset has no datetime values + # Try to determine average datetime from the datetime extent metadata on + # the dataset record + dt_avg = get_dataset_datetime(ds_id) + if dt_avg is not None: + if verbose >= 1: + print( + f"{ds_id}: Using average datetime from extent" + f" - filenames look like {df.iloc[0]['image']}" + ) + df["datetime"] = dt_avg + + if not any(df["datetime"].isna()): + # This dataframe already has all datetime information + return df + + select = df["datetime"].isna() + if ds_id in [889035, 889025]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the abstract on PANGAEA (sic): + # Experimet was setup during 2007-02-15 and 2007-06-13. + df.loc[select, "datetime"] = "2007" + + if ds_id in [896160, 896164]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the INDEX 2016 ROV (see dataset title and paper + # https://doi.org/10.3389/fmars.2019.00096) + df.loc[select, "datetime"] = "2016" + + return df + + +def interpolate_by_datetime(df, columns, **kwargs): + """ + Use datetime column to interpolate values for selected columns. + + Parameters + ---------- + df : pandas.DataFrame + Dataframe with ``"datetime"`` column, which may contain missing values + in other columns. + columns : str or iterable of str + Name of column or columns to fill in missing values with interpolation. + **kwargs + Additional arguments as per :func:`numpy.interp`. 
+ + Returns + ------- + df : pandas.DataFrame + Like input, but with missing values in specified columns completed by + linear interpolation over datetime. + """ + # Convert datetime string to a datetime object + datetime_actual = pd.to_datetime(df["datetime"]) + has_datetime = ~datetime_actual.isna() + if isinstance(columns, str): + columns = [columns] + for col in columns: + if col not in df: + continue + interp_kwargs = kwargs + if col in ["depth", "depth_of_observer", "bathymetry", "altitude"]: + if "left" not in interp_kwargs: + interp_kwargs["left"] = np.nan + if "right" not in interp_kwargs: + interp_kwargs["right"] = np.nan + has_col = ~df[col].isna() + has_dt_and_col = has_datetime & has_col + has_dt_not_col = has_datetime & ~has_col + df.loc[has_dt_not_col, col] = np.interp( + datetime_actual[has_dt_not_col], + datetime_actual[has_dt_and_col], + df.loc[has_dt_and_col, col], + **interp_kwargs, + ) + return df + + +def fixup_incomplete_metadata(df, ds_id=None, verbose=1): + """ + Fix datasets which have partial, but incomplete, lat/lon/datetime metadata. + + Interpolation is performed as appropriate to the dataset. The methodology + was determined by manually inspecting each dataset. + Any latitude and longitude values which can not be resolved are filled in + with the dataset-level mean latitude and longitude as reported by PANGAEA. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime, latitude, longitude, and/or depth + cells completed by interpolation or similar. + All existing datetime values are left unchanged. + """ + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + if ds_id in [753197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + + if ds_id in [805606, 805607, 805611, 805612]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print(f"{ds_id}: Interpolating by index") + indices = np.arange(len(df)) + col = "datetime" + select_not_col = df[col].isna() + select_has_col = ~select_not_col + if any(select_has_col) and any(select_not_col): + missing_timestamps = np.interp( + indices[select_not_col], + indices[select_has_col], + pd.to_datetime(df.loc[select_has_col, "datetime"]).apply( + lambda x: x.timestamp() + ), + ) + df.loc[select_not_col, col] = [ + datetime.datetime.fromtimestamp(int(ts)) for ts in missing_timestamps + ] + + if ds_id == 875080: + # N.B. There is date metadata in the csv, but not time. But there is time + # metadata in the filename, so we could extract this if we wanted to. + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + # lat/lon was only recorded for the first 11 images. Fill in the rest + # with the median latitude and longitude for the record at the end + # of this function. 
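The dataset-specific fixes below lean on `interpolate_by_datetime`, defined above. A self-contained sketch of the underlying idea, with synthetic rows and the `depth_of_observer` column name borrowed from this module (`frame`, `t`, and `known` are illustrative names):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame({
    "datetime": pd.to_datetime(
        ["2017-01-01 10:00", "2017-01-01 10:05", "2017-01-01 10:10", "2017-01-01 10:15"]
    ),
    "depth_of_observer": [100.0, np.nan, np.nan, 130.0],
})
t = frame["datetime"].astype("int64")  # nanoseconds since the epoch
known = frame["depth_of_observer"].notna()
frame.loc[~known, "depth_of_observer"] = np.interp(
    t[~known], t[known], frame.loc[known, "depth_of_observer"]
)
# the two missing rows become 110.0 and 120.0
```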
+ + if 873995 <= ds_id <= 874002: + if verbose >= 1: + print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") + # Interpolate lat, lon, and depth based on datetime + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) + + if ds_id in [875071, 875073]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Drop rows without datetime values (these have missing lat/lon as well) + # For 875071, these images are of the deck of the ship. + # For 875073, these images have a translation of less than half an image + # from the subsequent image, so we don't need the ones without metadata. + df = df[~df["datetime"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if ds_id in [875084]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # For 875084, images without latitude and longitude are not useful. + # The first three are of the deck, the rest are dark watercolumn shots. + df = df[~df["longitude"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if (878001 <= ds_id <= 878019) or ds_id == 878045: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # Images without metadata are of the water column and highly redundant. + df = df[~df["longitude"].isna()] + + if ds_id in [894732, 894734]: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # It's not clear to me that any of these images are of the seafloor. + df = df[~df["longitude"].isna()] + + if ds_id in [895557, 903782, 903788, 903850, 907025, 894801]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print( + f"{ds_id}: Interpolating by index over subset of images in the same series" + ) + indices = np.arange(len(df)) + image_no_ext = df["image"].apply(lambda x: os.path.splitext(x)[0]) + image_major = image_no_ext.str[:-3] + missing_dt = df["datetime"].isna() + missing_lat = df["latitude"].isna() + missing_lon = df["longitude"].isna() + for image_major_i in image_major.unique(): + select = image_major == image_major_i + col = "latitude" + select_and_col = select & ~missing_lat + select_not_col = select & missing_lat + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "longitude" + select_and_col = select & ~missing_lon + select_not_col = select & missing_lon + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "datetime" + select_and_col = select & ~missing_dt + select_not_col = select & missing_dt + if any(select_and_col) and any(select_not_col): + new_values = scipy.interpolate.interp1d( + indices[select_and_col], + pd.to_datetime(df.loc[select_and_col, col]).map( + pd.Timestamp.timestamp + ), + kind="nearest", + fill_value="extrapolate", + )(indices[select_not_col]) + new_values = pd.to_datetime(new_values, unit="s") + new_values = new_values.strftime("%Y-%m-%d") + df.loc[select_not_col, col] = new_values + + if ds_id in [911904, 918924, 919348]: + if verbose >= 1: + print(f"{ds_id}: Extracting missing datetime metadata for dataset {ds_id}") + # Extract missing datetime from the 
filename, formatted like (e.g.) + # TIMER_2019_03_31_at_05_50_12_IMG_0263 + has_no_datetime = df["datetime"].isna() + fname_inner = df.loc[has_no_datetime, "image"].apply( + lambda x: "_".join(x.split("_")[1:-2]) + ) + df.loc[has_no_datetime, "datetime"] = pd.to_datetime( + fname_inner, format="%Y_%m_%d_at_%H_%M_%S" + ) + if verbose >= 1: + print( + f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" + ) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) + + if ds_id in [914155]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Images without datetime are too dark + df = df[~df["datetime"].isna()] + # Other images are missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914156, 914197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. + for from_image, to_image in [ + ("IMG_0393", "IMG_0392"), + ("IMG_0395", "IMG_0394"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914192]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. 
+ for from_image, to_image in [ + ("IMG_1776", "IMG_1775"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ( + (702075 <= ds_id <= 702080) + or (818484 <= ds_id <= 818509) + or ds_id in [849287, 849289] + or 862084 <= ds_id <= 862097 + or ds_id in [875072, 875074] + or 875081 <= ds_id <= 875085 + ): + if verbose >= 1: + print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if any(df["latitude"].isna() | df["longitude"].isna()): + # Fill in any missing latitude and longitude values with the + # mean coordinate reported at the dataset level + ds = PanDataSet(ds_id, enable_cache=True) + if hasattr(ds, "geometryextent"): + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Latitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean latitude for missing values") + df.loc[df["latitude"].isna(), "latitude"] = lat + if long is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean longitude for missing values") + df.loc[df["longitude"].isna(), "longitude"] = long + + return df + + +def merge_duplicated_urls(df): + """ + Merge metadata across rows which have the same URL. + """ + print("Original number of rows:", len(df)) + df.drop_duplicates(inplace=True) + print("Number of rows after dropping simple duplicates:", len(df)) + # Record the original sort index so we can get the data back in the original + # order. + df["original_index"] = df.index + # Determine how many images are at the same location. This indicates how + # accurate the latitude and longitude information is. We will want to keep + # the most accurate version of this. + repeat_location_counts = df[["longitude", "latitude"]].value_counts() + repeat_location_counts = repeat_location_counts.to_frame() + repeat_location_counts.rename(columns={0: "tally_repeated_location"}, inplace=True) + # Add the tally_repeated_location data as a new column + df = df.merge(repeat_location_counts, how="left", on=["latitude", "longitude"]) + + def resolve_duplicates(sdf): + if len(sdf) == 1: + # If there's only one row in the group, return it. + return sdf.iloc[0] + # Take the entry which has the fewest repetitions of the latitude and + # longitude value. We will use the version from the first dataset that + # had the fewest repetitions of the location for this image. + # We adopt this row's collection, dataset, and site values in addition + # to its coordinates. + idx = np.argmin(sdf["tally_repeated_location"]) + row = sdf.iloc[idx].copy() + # For numeric columns (other than latitude and longitude), take the + # average of the values where they are present. 
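The surrounding `resolve_duplicates` helper reduces each group of rows sharing a URL to a single row. A toy end-to-end sketch of the same groupby pattern with simplified rules (`rows` and `reduce_group` are illustrative names, not part of the module):

```python
import pandas as pd

rows = pd.DataFrame({
    "url": ["a.jpg", "a.jpg", "b.jpg"],
    "depth_of_observer": [None, 20.0, 7.0],
    "datetime": ["2016", "2016-08-26", "2014-02"],
})

def reduce_group(group):
    row = group.iloc[0].copy()
    # average numeric values where present (NaNs are ignored by .mean())
    row["depth_of_observer"] = group["depth_of_observer"].mean()
    # keep the most precise (longest) datetime string
    row["datetime"] = group.loc[group["datetime"].str.len().idxmax(), "datetime"]
    return row

merged = rows.groupby("url").apply(reduce_group)
```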
+ for col in [ + "depth_of_observer", + "altitude", + "bathymetry", + "salinity", + "temperature", + "acidity", + "area", + ]: + if col not in sdf.columns: + continue + select = ~pd.isna(sdf[col]) + if select.sum() == 0: + continue + row[col] = sdf[select][col].mean() + # Look to see if we are missing an image or thumbnail entry and one + # of the duplicates has its value. + for col in ["image", "url_thumbnail"]: + if col not in sdf.columns: + continue + if not pd.isna(row[col]): + continue + values = sdf[col] + values = values[~pd.isna(values)] + if len(values) == 0: + continue + row[col] = values.iloc[0] + # For datetime, use the fact that we encoded datetime as a string + # with varying levels of precision. More digits means higher precision. + # Take the most precise value, preferring the value from the selected + # record in the event of a tie. + datetime_len = sdf["datetime"].str.replace(" 00:00:00", "").str.len() + idx_dt = np.argmax(datetime_len) + if datetime_len.iloc[idx] != datetime_len.iloc[idx_dt]: + row["datetime"] = sdf.iloc[idx_dt]["datetime"] + return row + + print("Merging metadata between rows with the same URL") + # Group by URL and apply our transformation to each group + df_out = df.groupby("url").progress_apply(resolve_duplicates) + # Reorder the dataframe to preseve implicit temporal information from the + # ordering of the images + df_out.sort_values("original_index", inplace=True) + df_out.drop(columns=["original_index", "tally_repeated_location"], inplace=True) + return df_out + + +def process_single(df, ds_id=None, verbose=1, remove_duplicate_columns=False): + """ + Reformat and cleanup metadata for a single dataset. + + Parameters + ---------- + df : pandas.Dataframe + The dataset to process. + ds_id : int, optional + The ID number for the PANGAEA dataset. If omitted, it is inferred from + the ``ds_id`` column of ``df``. + verbose : int, default=1 + Verbosity level. + remove_duplicate_columns : bool, default=False + Whether to remove duplicate column names. + + Returns + ------- + df : pandas.Dataframe + A processed copy of the dataset. 
+ """ + if df is None or len(df) == 0: + return df + + if ds_id is None: + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = int(ds_id.split("-")[-1]) + + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + df["ds_id"] = df["ds_id"].str.replace("pangaea-pangaea-", "pangaea-") + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df["parent_ds_id"] = df["parent_ds_id"].str.replace( + "pangaea-pangaea-", "pangaea-" + ) + + df = reformat_df(df, remove_duplicate_columns=remove_duplicate_columns) + if df is None: + return df + + url_col = "url" + df = df[df[url_col] != ""] + if len(df) == 0: + return df + + df = filter_urls(df, url_column=url_col) + if len(df) == 0: + return df + + # Drop rows that are complete duplicates + df.drop_duplicates(inplace=True) + + # Try to fix repeated URLs that are accidental dups but should differ + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) + + # Check for any rows that are all NaNs + if sum(df.isna().all("columns")) > 0: + print(f"{ds_id} has a row which is all NaNs") + + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. @@ -711,11 +1668,18 @@ def process_datasets(input_dirname, output_path=None, verbose=0): n_valid = 0 dfs = [] dfs_fnames = [] + ids_with_potential_labels = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 + if not fname.endswith(".csv"): + continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - df = pd.read_csv(os.path.join(input_dirname, fname)) + if ds_id in ["805690", "803979"]: + # The title was not captured from this dataset for some reason, + # so we can't exclude it via the title. + continue + df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): continue @@ -725,7 +1689,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_without_url.append(fname) continue - df["ds_id"] = f"pangaea-{ds_id}" + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df = reformat_df(df) if df is None: continue @@ -745,6 +1715,30 @@ def process_datasets(input_dirname, output_path=None, verbose=0): column_count[col] += 1 column_examples[col].append(fname) + for key in [ + # "Type", + "Content", # Yes! + # "Sample label", + # "ID", + # "Sample ID", + "Classification", # Yes! + "Species", # Yes! + # "Reference", + # "Samp type", + "Family", + "Genus", + # "Ind No", + # "Imagery", + # "Img brightness", # No + "Ground vis", # Yes! 
+ "Marine litter", + "Fisheries plastic", + "Unident litter", + ]: + if key in df.columns: + print(f"{fname} has {key}") + ids_with_potential_labels.append(ds_id) + # Drop rows that are complete duplicates df.drop_duplicates(inplace=True) @@ -752,7 +1746,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_with_repeat_urls.append(fname) # Try to fix repeated URLs that are accidental dups but should differ - df = fixup_repeated_urls(df, url_column=url_col, verbose=1) + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) if len(df) != len(df.drop_duplicates(subset=url_col)): files_with_repeat_urls2.append(fname) @@ -761,26 +1755,47 @@ def process_datasets(input_dirname, output_path=None, verbose=0): if sum(df.isna().all("columns")) > 0: print(f"{ds_id} has a row which is all NaNs") + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + dfs.append(df) dfs_fnames.append(fname) - print(f"There are {n_valid} valid (of {n_total}) valid datasets") - print( - f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" - ) - print( - f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" - ) - print() - print(f"There are {len(column_count)} unique column names:") - print() - - for col, count in dict( - sorted(column_count.items(), key=lambda item: item[1], reverse=True) - ).items(): - c = col + " " - print(f"{c:.<35s} {count:4d}") - print() + if verbose >= 0: + print(f"There are {n_valid} valid (of {n_total}) valid datasets") + print( + f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" + ) + for fname in files_with_repeat_urls: + print(f" {fname}") + print( + f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" + ) + for fname in files_with_repeat_urls2: + print(f" {fname}") + print() + print(f"There are {len(column_count)} unique column names:") + print() + + for col, count in dict( + sorted(column_count.items(), key=lambda item: item[1], reverse=True) + ).items(): + c = col + " " + print(f"{c:.<35s} {count:4d}") + print() + ids_with_potential_labels = sorted(set(ids_with_potential_labels)) + print( + f"There are {len(ids_with_potential_labels)} datasets which might have labels to extract:" + ) + for ds_id in ids_with_potential_labels: + print(ds_id) + print() if verbose >= 1: print("Filter columns") @@ -789,26 +1804,43 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "dataset", "site", "url", + "url_thumbnail", "image", "datetime", "latitude", "longitude", + "area", "altitude", - "depth", + "depth_of_observer", + "bathymetry", "backscatter", "temperature", "salinity", "chlorophyll", "acidity", + "parent_ds_id", } df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + df_all.rename(columns={"parent_ds_id": "collection"}, inplace=True) + + print(f"There are {len(df_all)} records before dropping duplicated URLs") + + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + output_path_with_dups = os.path.splitext(output_path)[0] + "_with-duplicates.csv" + if verbose >= 0: + print(f"Saving (with duplicates) to {output_path_with_dups}") + 
df_all.to_csv(output_path_with_dups, index=False) # Remove duplicate URLs if verbose >= 1: - print("Remove duplicates") - df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print("Merge duplicated URLs") + # Convert datetime to string + df_all["datetime"] = df_all["datetime"].astype(str) + df_all = merge_duplicated_urls(df_all) + print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field if fixup_repeated_output_paths is None: @@ -817,12 +1849,10 @@ def process_datasets(input_dirname, output_path=None, verbose=0): else: if verbose >= 1: print("Fix repeated output paths to prevent collisions") - df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=2) + df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) - if os.path.dirname(output_path): - os.makedirs(os.path.dirname(output_path), exist_ok=True) if verbose >= 0: - print(f"Saving to {output_path}") + print(f"Saving (without duplicates) to {output_path}") df_all.to_csv(output_path, index=False) @@ -875,7 +1905,7 @@ def get_parser(): "--verbose", "-v", action="count", - default=0, + default=1, help=textwrap.dedent( """ Increase the level of verbosity of the program. This can be diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index bb17b7f..9623b10 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -16,7 +16,12 @@ from pangaea_downloader.tools import datasets, process, scraper, search -def search_and_download(queries=None, output_dir="query-outputs", verbose=0): +def search_and_download( + queries=None, + output_dir="query-outputs", + auth_token=None, + verbose=0, +): """ Search `PANGAEA`_ for a set of queries, and download datasets for each result. @@ -31,6 +36,8 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): output_dir : str, default="query-outputs" The output directory where downloaded datasets will be saved. Any existing output datasets will be skipped instead of downloaded. + auth_token : str, optional + Bearer authentication token. verbose : int, default=1 Verbosity level. 
""" @@ -49,6 +56,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): os.makedirs(output_dir, exist_ok=True) df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) + fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" + if not os.path.isfile(fname_child2parent): + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") + # Process each result dictionary n_files = 0 n_downloads = 0 @@ -75,7 +87,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) if df_list is None: if verbose >= 1: print( @@ -93,9 +109,32 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): + colorama.Fore.RESET ) continue - df = pd.concat(df_list) + for df in df_list: + if df is None or len(df) == 0: + continue + # Add the parent's ID to the dataframe + df["parent_ds_id"] = ds_id + # Save the child to its own CSV, including a column that + # records the parent's dataset ID + child_id = df.iloc[0]["ds_id"] + child_output_path = os.path.join(output_dir, f"{child_id}.csv") + saved = datasets.save_df( + df, child_output_path, level=1, verbose=verbose - 1 + ) + n_downloads += 1 if saved else 0 + with open(fname_child2parent, "a") as f: + f.write(f"{child_id},{ds_id}\n") + # We have saved all the children individually, so will skip + # saving a redundant merged dataframe + # But we will save an empty file so we know to skip + with open(output_path, "w") as f: + f.write("is_parent") + continue else: - dataset_type = process.ds_type(size) + try: + dataset_type = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {ds_id}") if dataset_type == "video": if verbose >= 1: print( @@ -107,7 +146,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): elif dataset_type == "paginated": df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": - df = datasets.fetch_child(url, verbose=verbose - 1) + df = datasets.fetch_child( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) except Exception as err: if isinstance(err, KeyboardInterrupt): raise @@ -121,16 +164,7 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ----------------- SAVE TO FILE ----------------- # if df is None: continue - try: - saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) - except Exception as err: - # Delete partially saved file, if present - if os.path.isfile(output_path): - try: - os.remove(output_path) - except Exception: - pass - raise err + saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) n_downloads += 1 if saved else 0 if verbose >= 0: @@ -138,7 +172,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): print(f"Number of files previously saved: {n_files}.") print(f"Total dataset files: {n_files + n_downloads}") print(f"Number of dataset errors (excluding access): {len(errors)}.") - + if len(errors) > 0: + print() + print("Captured errors are now repeated as follows.") for msg in errors: print() print(msg) @@ -190,6 +226,11 @@ def get_parser(): default="query-outputs", help="Directory for downloaded datasets. 
Default is %(default)s.", ) + parser.add_argument( + "--auth-token", + type=str, + help="Bearer authentication token", + ) parser.add_argument( "--verbose", "-v", diff --git a/pangaea_downloader/tools/checker.py b/pangaea_downloader/tools/checker.py index 7292c43..3d345d6 100644 --- a/pangaea_downloader/tools/checker.py +++ b/pangaea_downloader/tools/checker.py @@ -61,8 +61,8 @@ def is_invalid_file_ext(filename: str) -> bool: # --------------------------------------------- DataFrame Checkers --------------------------------------------- # def has_url_col(df: DataFrame) -> bool: """Take a Pandas DataFrame and return True if it has image URL column.""" - condition1 = any(["url" in col.lower() for col in df.columns]) - condition2 = any(["image" in col.lower() for col in df.columns]) + condition1 = any("url" in col.lower() for col in df.columns) + condition2 = any("image" in col.lower() for col in df.columns) return condition1 or condition2 diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index b54c6d5..8b6f83a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -6,6 +6,8 @@ use pangaea_downloader.tools.scraper module. """ import os +import shutil +import tempfile import time from typing import List, Optional @@ -16,19 +18,24 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0.1667 +# T_POLL_INTV = 0 # Allow rapid loading of cached records +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 180 requests within 30s -def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: +def fetch_child( + child_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url) + ds = PanDataSet(child_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() - doi = getattr(ds, "doi", "").split("doi.org/")[-1] # Dataset is restricted if ds.loginstatus != "unrestricted": if verbose >= 1: @@ -39,7 +46,7 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Check for image URL column - if not checker.has_url_col(ds.data): + if ensure_url and not checker.has_url_col(ds.data): if verbose >= 1: print( colorama.Fore.YELLOW @@ -48,20 +55,29 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Add metadata - df = set_metadata(ds, alt=doi) + df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) + # Add dataset ID + doi = getattr(ds, "doi", "") + ds_id = uri2dsid(doi if doi else child_url) + df["ds_id"] = ds_id return df -def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: +def fetch_children( + parent_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url) + ds = PanDataSet(parent_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -78,9 +94,13 @@ def fetch_children(parent_url: str, 
@@ -78,9 +94,13 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
     df_list = []
     for i, child_uri in enumerate(ds.children):
         url = process.url_from_uri(child_uri)
+        ds_id = uri2dsid(child_uri)
         size = process.get_html_info(url)
         # Assess type
-        typ = process.ds_type(size)
+        try:
+            typ = process.ds_type(size)
+        except Exception:
+            raise ValueError(f"Can't process type from size for {url}")
         if typ == "video":
             if verbose >= 1:
                 print(
@@ -91,14 +111,12 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
             continue
         elif typ == "paginated":
             if verbose >= 1:
-                print(f"\t\t[{i+1}] Scrapping dataset...")
+                print(f"\t\t[{i+1}] Scraping dataset...")
             df = scraper.scrape_image_data(url)
-            if df is not None:
-                df_list.append(df)
         elif typ == "tabular":
             t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time())
             time.sleep(t_wait)  # Stay under 180 requests every 30s
-            child = PanDataSet(url)
+            child = PanDataSet(url, enable_cache=True, auth_token=auth_token)
             T_POLL_LAST = time.time()
             if ds.loginstatus != "unrestricted":
                 if verbose >= 1:
@@ -107,21 +125,23 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
                         + f"\t\t[{i+1}] [ERROR] Access restricted: '{ds.loginstatus}'. {url}"
                         + colorama.Fore.RESET
                     )
-                return
-            if not checker.has_url_col(child.data):
+                continue
+            if ensure_url and not checker.has_url_col(child.data):
                 if verbose >= 1:
                     print(
                         colorama.Fore.YELLOW
                         + f"\t\t[{i+1}] [WARNING] Image URL column NOT found! {url} Skipping..."
                         + colorama.Fore.RESET
                     )
-            else:
-                # Add metadata
-                child_doi = getattr(child, "doi", "").split("doi.org/")[-1]
-                df = set_metadata(child, alt=child_doi)
-                # Add child dataset to list
-                df = exclude_rows(df)
-                df_list.append(df)
+                continue
+            # Add metadata
+            df = set_metadata(child)
+            # Add child dataset to list
+            df = exclude_rows(df)
+            if df is None:
+                continue
+            df["ds_id"] = ds_id
+            df_list.append(df)

     # Return result
     if len(df_list) > 0:
@@ -131,20 +151,13 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
     return None


-def set_metadata(ds: PanDataSet, alt="unknown") -> DataFrame:
+def set_metadata(ds: PanDataSet) -> DataFrame:
     """Add metadata to a PanDataSet's dataframe."""
     ds.data["dataset_title"] = ds.title
     ds.data["doi"] = getattr(ds, "doi", "")
     # Dataset campaign
     if (len(ds.events) > 0) and (ds.events[0].campaign is not None):
         ds.data["campaign"] = ds.events[0].campaign.name
-    else:
-        ds.data["campaign"] = alt
-    # Dataset site/event/deployment
-    if "Event" in ds.data.columns:
-        ds.data["site"] = ds.data["Event"]
-    else:
-        ds.data["site"] = alt + "_site"
     return ds.data


@@ -162,9 +175,16 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) ->
         if verbose >= 1:
             print(f"{tabs}[{idx}] Empty DataFrame! File not saved!")
         return False
-    # Save if dataframe not empty
-    df.to_csv(output_path, index=False)
-    print(f"{tabs}[{idx}] Saved to '{output_path}'")
+    # Save dataframe if it is not empty
+    with tempfile.TemporaryDirectory() as dir_tmp:
+        # Write to a temporary file
+        tmp_path = os.path.join(dir_tmp, os.path.basename(output_path))
+        df.to_csv(tmp_path, index=False)
+        # Move our temporary file to the destination
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        shutil.move(tmp_path, output_path)
+    if verbose >= 1:
+        print(f"{tabs}[{idx}] Saved to '{output_path}'")
     return True


@@ -212,6 +232,13 @@ def fix_text(text: str) -> str:
     return text


+def uri2dsid(uri: str) -> str:
+    """
+    Extract PANGAEA dataset ID from url/uri/doi string.
+ """ + return uri.split("PANGAEA.")[-1] + + def get_dataset_id(df: DataFrame) -> str: """Take a Pandas DataFrame as input and return the datasets Pangaea ID.""" col = find_column_match("doi") diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py index 219fa31..9696c93 100644 --- a/pangaea_downloader/tools/eda.py +++ b/pangaea_downloader/tools/eda.py @@ -4,10 +4,11 @@ import matplotlib.cm import matplotlib.colors import numpy as np -import requests from matplotlib.pyplot import get_cmap from sklearn.neighbors import KernelDensity +from . import requesting + def url_from_doi(doi: str) -> str: """ @@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array: """Take an image url and return retrieved image array.""" success = False while not success: - resp = requests.get(url, stream=True) + resp = requesting.get_request_with_backoff(url, stream=True) print(f"status code: {resp.status_code}") if verbose else 0 success = True if (resp.status_code == 200) else False if success: diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py index 6d92e73..74d8ea6 100644 --- a/pangaea_downloader/tools/process.py +++ b/pangaea_downloader/tools/process.py @@ -1,9 +1,10 @@ """Functions for processing each of the result items.""" from typing import Optional, Tuple -import requests from bs4 import BeautifulSoup +from . import requesting + def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str: """Take a pangaea uri/doi string as input and return its corresponding url string.""" @@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]: def get_html_info(url: str) -> Optional[str]: """Make get request to dataset webpage and return dataset size.""" # Make get request to webpage - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) if resp.status_code == 200: # Parse html soup = BeautifulSoup(resp.text, "lxml") diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py new file mode 100644 index 0000000..1ec7ef0 --- /dev/null +++ b/pangaea_downloader/tools/requesting.py @@ -0,0 +1,49 @@ +""" +URL request utilities. +""" + +import time + +import requests + + +def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs): + """ + Fetch a URL resource using requests with a custom backoff strategy for re-attempts. + + Parameters + ---------- + url : str + The URL to request. + retries : int, default=5 + Maximum number of attempts. + backoff_factor : float, default=1 + Base time to wait for before attempting to download again when receiving + a 500 or 503 HTTP status code. + verbose : int, default=1 + Verbosity level. + **kwargs + Additional arguments as per :func:`requests.get`. + """ + for i_attempt in range(retries): + r = requests.get(url, **kwargs) + if r.status_code not in [429, 500, 503]: + # Status code looks good + break + # N.B. Could also retry on [408, 502, 504, 599] + if r.status_code == 429: + # PANGAEA has a maximum of 180 requests within a 30s period + # Wait for this to cool off completely. + t_wait = 30 + else: + # Other errors indicate a server side error. Wait a + # short period and then retry to see if it alleviates. 
diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py
index 219fa31..9696c93 100644
--- a/pangaea_downloader/tools/eda.py
+++ b/pangaea_downloader/tools/eda.py
@@ -4,10 +4,11 @@
 import matplotlib.cm
 import matplotlib.colors
 import numpy as np
-import requests
 from matplotlib.pyplot import get_cmap
 from sklearn.neighbors import KernelDensity

+from . import requesting
+

 def url_from_doi(doi: str) -> str:
     """
@@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array:
     """Take an image url and return retrieved image array."""
     success = False
     while not success:
-        resp = requests.get(url, stream=True)
+        resp = requesting.get_request_with_backoff(url, stream=True)
         print(f"status code: {resp.status_code}") if verbose else 0
         success = True if (resp.status_code == 200) else False
         if success:
diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py
index 6d92e73..74d8ea6 100644
--- a/pangaea_downloader/tools/process.py
+++ b/pangaea_downloader/tools/process.py
@@ -1,9 +1,10 @@
 """Functions for processing each of the result items."""
 from typing import Optional, Tuple

-import requests
 from bs4 import BeautifulSoup

+from . import requesting
+

 def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str:
     """Take a pangaea uri/doi string as input and return its corresponding url string."""
@@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]:
 def get_html_info(url: str) -> Optional[str]:
     """Make get request to dataset webpage and return dataset size."""
     # Make get request to webpage
-    resp = requests.get(url)
+    resp = requesting.get_request_with_backoff(url)
     if resp.status_code == 200:
         # Parse html
         soup = BeautifulSoup(resp.text, "lxml")
diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py
new file mode 100644
index 0000000..1ec7ef0
--- /dev/null
+++ b/pangaea_downloader/tools/requesting.py
@@ -0,0 +1,49 @@
+"""
+URL request utilities.
+"""
+
+import time
+
+import requests
+
+
+def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs):
+    """
+    Fetch a URL resource using requests with a custom backoff strategy for re-attempts.
+
+    Parameters
+    ----------
+    url : str
+        The URL to request.
+    retries : int, default=5
+        Maximum number of attempts.
+    backoff_factor : float, default=1
+        Base time to wait before attempting the download again when receiving
+        a 500 or 503 HTTP status code.
+    verbose : int, default=1
+        Verbosity level.
+    **kwargs
+        Additional arguments as per :func:`requests.get`.
+    """
+    for i_attempt in range(retries):
+        r = requests.get(url, **kwargs)
+        if r.status_code not in [429, 500, 503]:
+            # Status code looks good
+            break
+        # N.B. Could also retry on [408, 502, 504, 599]
+        if r.status_code == 429:
+            # PANGAEA has a maximum of 180 requests within a 30s period
+            # Wait for this to cool off completely.
+            t_wait = 30
+        else:
+            # Other errors indicate a server side error. Wait a
+            # short period and then retry to see if it alleviates.
+            t_wait = backoff_factor * 2**i_attempt
+        if verbose >= 1:
+            print(
+                "Retrying in {} seconds (HTTP Status {}): {}".format(
+                    t_wait, r.status_code, url
+                )
+            )
+        time.sleep(t_wait)
+    return r
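To make the retry behaviour concrete, here is a small usage sketch of the new helper; the URL and the `timeout` keyword are illustrative values, not something this patch prescribes:

```python
from pangaea_downloader.tools.requesting import get_request_with_backoff

# Retries on HTTP 429, 500, and 503: waits a flat 30 s on 429 (the PANGAEA
# rate limit) and backoff_factor * 2**attempt seconds (1, 2, 4, ...) otherwise.
resp = get_request_with_backoff(
    "https://doi.pangaea.de/10.1594/PANGAEA.805606",
    retries=5,
    backoff_factor=1,
    timeout=30,  # forwarded to requests.get via **kwargs
)
resp.raise_for_status()
print(resp.status_code, len(resp.content))
```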
diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py
index 95218f2..a36cd34 100644
--- a/pangaea_downloader/tools/scraper.py
+++ b/pangaea_downloader/tools/scraper.py
@@ -9,7 +9,7 @@
 from pangaeapy import PanDataSet
 from requests.compat import urljoin

-import pangaea_downloader.tools.datasets as datasets
+from pangaea_downloader.tools import datasets, requesting


 def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
@@ -17,16 +17,40 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
     # Load dataset
     t_wait = max(0, datasets.T_POLL_LAST + datasets.T_POLL_INTV - time.time())
     time.sleep(t_wait)  # Stay under 180 requests every 30s
-    ds = PanDataSet(url)
+    ds = PanDataSet(url, enable_cache=True)
     datasets.T_POLL_LAST = time.time()
     # Request dataset url
     if verbose >= 1:
         print("\t\t\t[INFO] Requesting:", url)
-    resp = requests.get(url)
+    resp = requesting.get_request_with_backoff(url)
     # Parse response
     soup = BeautifulSoup(resp.text, "lxml")
     # Get coordinates of expedition
     coordinates = get_metadata(soup)
+    if coordinates is None and hasattr(ds, "geometryextent"):
+        print(
+            colorama.Fore.YELLOW + "\t\t\t[ALERT] Trying to get coordinates from"
+            " PanDataSet.geometryextent" + colorama.Fore.RESET
+        )
+        lat = None
+        long = None
+        for k in ["meanLatitude", "latitude", "Latitude"]:
+            if k in ds.geometryextent:
+                lat = ds.geometryextent[k]
+                break
+        for k in ["meanLongitude", "longitude", "Longitude"]:
+            if k in ds.geometryextent:
+                long = ds.geometryextent[k]
+                break
+        # Only fall back to these values if at least one coordinate was found
+        if lat is not None or long is not None:
+            coordinates = lat, long
+
+    if coordinates is None:
+        print(
+            colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!"
+            " Saved file won't have Longitude, Latitude columns!"
+            + colorama.Fore.RESET
+        )
     # Get download link to photos page
     download_link = soup.find("div", attrs={"class": "text-block top-border"}).a["href"]

@@ -34,7 +58,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
     if verbose >= 1:
         print("\t\t\t[INFO] URL to photos page:", download_link)
     # Get to photos page (page 1)
-    resp = requests.get(download_link)
+    resp = requesting.get_request_with_backoff(download_link)
     photos_page = BeautifulSoup(resp.text, "lxml")
     img_urls = get_urls_from_each_page(photos_page, src_url, verbose=verbose)
     if img_urls is None:
@@ -47,17 +71,13 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
         lat, long = coordinates
         df["Longitude"] = long
         df["Latitude"] = lat
-    df["Dataset"] = ds.title
-    df["DOI"] = getattr(ds, "doi", "")
-    doi = getattr(ds, "doi", "").split("doi.org/")[-1]
+    df["dataset_title"] = ds.title
+    doi = getattr(ds, "doi", "")
+    df["DOI"] = doi
+    ds_id = datasets.uri2dsid(doi if doi else url)
+    df["ds_id"] = ds_id
     if (len(ds.events) > 0) and (ds.events[0].campaign is not None):
-        df["Campaign"] = ds.events[0].campaign.name
-    else:
-        df["Campaign"] = doi
-    if "Event" in ds.data.columns:
-        df["Site"] = ds.data["Event"]
-    else:
-        df["Site"] = doi + "_site"
+        df["campaign"] = ds.events[0].campaign.name
     return df


@@ -71,10 +91,6 @@ def get_metadata(page_soup: BeautifulSoup) -> Optional[Tuple[float, float]]:
         lat = float(coordinates.find("span", attrs={"class": "latitude"}).text)
         long = float(coordinates.find("span", attrs={"class": "longitude"}).text)
         return lat, long
-    print(
-        colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!"
-        " Saved file won't have Longitude, Latitude columns!" + colorama.Fore.RESET
-    )
     return None


@@ -91,7 +107,7 @@ def get_urls_from_each_page(
         if verbose >= 1:
             print(f"\t\t\t[INFO] Processing Page {n}...")
         url = pagination[n]
-        resp = requests.get(url)
+        resp = requesting.get_request_with_backoff(url)
         soup = BeautifulSoup(resp.text, "lxml")
         urls = get_page_image_urls(soup, verbose=verbose)
         img_urls.extend(urls)
@@ -111,7 +127,7 @@ def get_pagination(page_soup: BeautifulSoup, src_url: str) -> Optional[dict]:
     # List of page URLs
     page_urls = [urljoin(src_url, a["href"]) for a in pagination.find_all("a")][:-1]
     # Page number : Page URL
-    page_dict = {k: v for k, v in zip(page_nums, page_urls)}
+    page_dict = dict(zip(page_nums, page_urls))
     return page_dict


diff --git a/requirements.txt b/requirements.txt
index 368ee16..05b0ea0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,8 @@ matplotlib>=3.4.2
 numpy>=1.20.3
 opencv-python>=4.5.2.54
 pandas>=1.2.5
-pangaeapy>=0.0.5
+pangaeapy>=1.0.6
 requests>=2.25.1
 scikit-learn>=0.24.2
+scipy
 tqdm
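Finally, a sketch of how the per-query `*_child2parent.csv` mapping and the per-child CSVs written by `search_and_download` could be recombined downstream. The paths follow the defaults used in this patch, but the snippet itself is illustrative rather than part of the package:

```python
import os

import pandas as pd

output_dir = "query-outputs"  # assumed default output directory
child2parent = pd.read_csv(output_dir.rstrip("/") + "_child2parent.csv")

frames = []
for child_id, parent_id in zip(child2parent["child"], child2parent["parent"]):
    path = os.path.join(output_dir, f"{child_id}.csv")
    if not os.path.isfile(path):
        continue  # child was skipped (restricted, video, no image URL column, ...)
    df = pd.read_csv(path, low_memory=False)
    df["parent_ds_id"] = parent_id  # same column the downloader records
    frames.append(df)

merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f"{len(merged)} records from {len(frames)} child datasets")
```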