diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19b80c0..52656ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -80,8 +80,6 @@ repos: - id: detect-private-key - id: end-of-file-fixer exclude: ^LICENSE|\.(html|csv|txt|svg|py)$ - - id: pretty-format-json - args: ["--autofix", "--no-ensure-ascii", "--no-sort-keys"] - id: requirements-txt-fixer - id: trailing-whitespace args: [--markdown-linebreak-ext=md] diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index b84d525..53305b9 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -18,7 +18,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from benthicnet.utils import sanitize_filename, sanitize_filename_series\n", + "from benthicnet.io import sanitize_filename, sanitize_filename_series\n", "from IPython.display import display\n", "from tqdm.auto import tqdm\n", "\n", @@ -30,13 +30,19 @@ "cell_type": "code", "execution_count": null, "id": "b6f9ebdb", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Load datasets from this directory\n", "dirname = \"../query-outputs_2022-01-01\"\n", + "dirname = \"../query-outputs_2023-03-07_extras/\"\n", + "dirname = \"../query-outputs_2023-03-30c/\"\n", + "# dirname = \"../query-outputs_2023-03-30c\"\n", "# Pangaea benthic image dataset file with filtered dataset IDs\n", "pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", + "pangaea_file = \"../datasetcsvs/pangaea_2023-03-30c_with-tiles4.csv\"\n", "pangaea_df = pd.read_csv(pangaea_file)\n", "ds_ids = pangaea_df.dataset.unique()\n", "print(f\"Total {len(ds_ids)} datasets to process.\")" @@ -186,6 +192,78 @@ " print(f\"{c:.<35s} {count:4d}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a251b7dd-673b-43c0-b948-bb83019aedb1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"sal\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42a79516-2ab2-45ee-b876-daf12758ed00", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"area\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235276c3-d887-46b6-a453-2873a636533a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"length\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "897336e0-d260-46d4-a71b-7e882e785ce5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"classification\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f159be9-f6dc-4d0f-ae6a-a781a9983cdf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"content\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d804d2f-6adb-42f3-b164-68fe42a08b92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"ground vis\"]" + ] + }, { "cell_type": "markdown", "id": "a07b478a-bd3d-417f-8e88-f49ea585c812", @@ -249,7 +327,7 @@ "\n", "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -265,14 +343,21 @@ " f\"[{i}] Mean: {mean:.2f} ± 
{sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", " )\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", " # print(\"\\tMin or Max non-positive.\")\n", " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", - " print(\"\\tStart or Ene near zero.\")\n", + " print(\"\\tStart or End near zero.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if min_ < 0:\n", + " print(\"\\tNegative depth.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, @@ -307,8 +392,9 @@ "# Column to find\n", "key = \"bathy depth\"\n", "\n", + "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -320,8 +406,12 @@ " # Show\n", " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", " val_exception[url] = (mean, sd, min_, max_)" @@ -361,7 +451,7 @@ "# Depth bot & depth top\n", "\n", "for i, file in enumerate(column_examples[keys[0]]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", @@ -380,7 +470,76 @@ " plt.plot(df[key], label=key)\n", " plt.plot(abs(df[\"depth top\"] - df[\"depth bot\"]), label=\"diff\", linestyle=\":\")\n", " plt.legend()\n", - " plt.show()" + " plt.title(url.split(\"/\")[-1])\n", + " plt.show()\n", + " print(url)" + ] + }, + { + "cell_type": "markdown", + "id": "0ee401a9-e936-4d8b-915d-ed3b1303fd65", + "metadata": {}, + "source": [ + "### 2.4 Elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1ae559-e6ee-47b8-8f20-69bcef238cb5", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Column to find\n", + "key = \"elevation\"\n", + "\n", + "val_exception = {}\n", + "for i, file in enumerate(column_examples[key]):\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " url_column = find_url_column(df)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Check for start and end at 0 altitude/depth\n", + " start, end = df[key].iloc[0], df[key].iloc[-1]\n", + " # Show\n", + " print(\n", + " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", + " )\n", + " plt.figure(figsize=(16, 4))\n", + " 
plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.show()\n", + " print(url)\n", + " # Datasets that defy column value norms\n", + " # if (min_ <= 0) or (max_ <= 0):\n", + " # print(\"\\tMin or Max non-positive.\")\n", + " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if max_ > 0:\n", + " print(\"\\tPositive elevation.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb64c1f-b5b3-4d8c-9943-b9c6810a1d53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_exception" ] }, { @@ -402,7 +561,7 @@ "print(len(column_examples[\"depth water\"]))\n", "print(len(column_examples[\"bathy depth\"]))\n", "print(len(column_examples[\"bathy depth_2\"]))\n", - "print(len(column_examples[\"bathy_depth\"]))" + "print(len(column_examples[\"elevation\"]))" ] }, { @@ -458,7 +617,7 @@ "keys = [\"depth\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -513,7 +672,7 @@ "keys = [\"depth water\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -600,13 +759,113 @@ "**NOTE:** Upon checking the dataset webpages we see that the two bathy depth columns correspond to the original collection and recollection sites." 
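The comparison cells above overlay pairs of depth-type columns from the same file to see whether they track one another. A minimal numeric version of the same check — assuming `df` is one of the loaded dataset files and using the column names as they appear in the notebook:

```python
# How closely do two depth-type columns agree within one file?
pair = df[["depth water", "bathy depth"]].dropna()
gap = (pair["depth water"] - pair["bathy depth"]).abs()
print(f"rows compared: {len(pair)}, median |difference|: {gap.median():.2f} m")
```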
] }, + { + "cell_type": "markdown", + "id": "61d6de0f-09d8-43f5-a2b6-c47afed77a9d", + "metadata": {}, + "source": [ + "## 3.5 Datasets with depth water and elevation" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6aabda82", + "id": "c3627d1c-717d-4dc2-b20d-761adebd513d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column1 = \"depth water\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eeae8d64-0038-47ea-bc0b-8e59e0724b5e", "metadata": {}, + "source": [ + "## 3.6 Datasets with bathy depth and elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eccfc931-307a-4007-8306-6ea918a1489b", + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "column1 = \"bathy depth\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] } ], "metadata": { diff --git a/pangaea_downloader/citations.py b/pangaea_downloader/citations.py index db6796f..833463c 100644 --- a/pangaea_downloader/citations.py +++ b/pangaea_downloader/citations.py @@ -1,13 +1,14 @@ import pickle import pandas as pd -import requests + +from .tools import requesting def get_bibtex(ds_id: str, verbose=False) -> str: """Get the BibTex Citation of a Pangaea dataset using the dataset ID.""" bib_url = 
f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}?format=citation_bibtex" - resp = requests.get(bib_url) + resp = requesting.get_request_with_backoff(bib_url) if verbose: print("\tStatus code:", resp.status_code) return resp.text diff --git a/pangaea_downloader/licenses.py b/pangaea_downloader/licenses.py index 4ba9a38..8cfac90 100644 --- a/pangaea_downloader/licenses.py +++ b/pangaea_downloader/licenses.py @@ -5,10 +5,11 @@ from typing import Dict, Optional, Union import pandas as pd -import requests from bs4 import BeautifulSoup from tqdm import tqdm +from .tools import requesting + def get_dataset_url(ds_id: Union[str, int]) -> str: """Return dataset URL given the six digit dataset ID.""" @@ -18,7 +19,7 @@ def get_dataset_url(ds_id: Union[str, int]) -> str: def get_dataset_license_info(url: str) -> Optional[Dict[str, str]]: """Return a dictionary with license information given the dataset URL.""" # Make a request to the URL and parse the html - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") # Get the tag containing the license info license_tag = soup.find("a", attrs={"rel": "license"}) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index eb15e1a..a7532fa 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -10,21 +10,28 @@ import os import re from collections import defaultdict +from functools import partial -import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd -from IPython.display import display +import scipy.interpolate +from pandas.api.types import is_numeric_dtype +from pangaeapy import PanDataSet from tqdm.auto import tqdm from pangaea_downloader import __meta__ from pangaea_downloader.tools import checker try: - from benthicnet.io import fixup_repeated_output_paths + from benthicnet.io import fixup_repeated_output_paths, row2basename except ImportError: fixup_repeated_output_paths = None + row2basename = None + +# Create new `pandas` methods which use `tqdm` progress +# (can use tqdm_gui, optional kwargs, etc.) 
+tqdm.pandas() TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], @@ -196,6 +203,13 @@ def check_title(title): return False if title.startswith("Images of shell cross sections"): return False + if ( + "early biofouling processes in a coastal lagoon" in title.lower() + or "early biofouling processes in a coastal la goon" in title.lower() + ): + return False + if "photographs of tiles" in title.lower(): + pass return True @@ -226,8 +240,21 @@ def reformat_df(df, remove_duplicate_columns=True): # Make a copy of the dataframe so we can't overwrite the input df = df.copy() - # Remove bad columns - df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") + # Get dataset id from first row + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = ds_id.split("-")[-1] + + # Handle Area column + for col in ["Area", "Area_2", "Area_3"]: + # Area is sometimes the seafloor surface area of the image in + # meters^2 and sometimes used as a synonym for location + if col in df.columns and not all(df[col].isna()) and is_numeric_dtype(df[col]): + print(df.columns) + print(f"{ds_id}: Using {col} for area measurement") + df.rename(columns={col: "area"}, inplace=True, errors="raise") + break + # Remove duplicately named columns cols_to_drop = [] if remove_duplicate_columns: @@ -241,6 +268,8 @@ def reformat_df(df, remove_duplicate_columns=True): ): cols_to_drop.append(col) df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Remove bad columns + df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") # Find the correct URL column, and drop other columns containing "url" cols_to_drop = [] @@ -256,8 +285,9 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. 
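The `desired_columns` mapping that follows is resolved first-match-wins: for each canonical output name, the highest-priority search name present among the cleaned column names is renamed to it and any lower-priority matches are dropped. A simplified standalone sketch of that rule (the `resolve_columns` helper is hypothetical, not the exact loop used in `reformat_df`):

```python
def resolve_columns(columns, desired):
    """Return (rename_mapping, columns_to_drop) using first-match-wins priority."""
    lower = [c.lower() for c in columns]
    mapping, to_drop = {}, []
    for canon, candidates in desired.items():
        hits = [columns[lower.index(c)] for c in candidates if c in lower]
        if hits:
            mapping[hits[0]] = canon  # highest-priority match is kept
            to_drop.extend(hits[1:])  # lower-priority duplicates are discarded
    return mapping, to_drop

# resolve_columns(["Latitude", "Lat"], {"latitude": ["latitude", "lat"]})
# -> ({"Latitude": "latitude"}, ["Lat"])
```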
desired_columns = { - "dataset": ["ds_id", "dataset", "Campaign", "campaign"], - "site": ["Event", "event", "Site", "site", "deployment"], + "url_thumbnail": ["urlthumb", "urlthumbnail"], + "dataset": ["ds_id"], + "site": ["Event", "event", "deployment"], "image": ["image", "filename"], "datetime": [ "Date/Time", @@ -274,7 +304,7 @@ def reformat_df(df, remove_duplicate_columns=True): "latitude+", "latitudemed", "latitudenorth", - "latitudesouth", + # "latitudesouth", # special handling ], "longitude": [ "Longitude", @@ -283,20 +313,15 @@ def reformat_df(df, remove_duplicate_columns=True): "long", "longitude+", "longitudemed", - "longitudewest", "longitudeeast", + # "longitudewest", # special handling ], "x_pos": [], "y_pos": [], - "altitude": ["altitude", "height"], - "depth": [ - "depthwater", - "bathydepth", - "bathymetry", - "bathy", - "depth", - "elevation", - ], + "altitude": ["altitude", "heightaboveseafloor", "height"], + "depth_camera": ["depthwater", "depth"], + "depth_seafloor": ["bathydepth", "bathymetry", "bathy"], + "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], "salinity": ["salinity", "sal"], @@ -325,6 +350,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -338,6 +365,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -351,6 +380,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -358,15 +389,43 @@ def reformat_df(df, remove_duplicate_columns=True): # Remove superfluous columns df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") + # Handle latitudesouth and longitudewest + if "latitude" not in df.columns and "latitudesouth" in lower_cols: + col = df.columns[lower_cols.index("latitudesouth")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "latitude" not in df.columns and "latitude-" in lower_cols: + col = df.columns[lower_cols.index("latitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "longitude" not in df.columns and "longitudewest" in lower_cols: + col = df.columns[lower_cols.index("longitudewest")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + if "longitude" not in df.columns and "longitude-" in lower_cols: + col = df.columns[lower_cols.index("longitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + + # Remove datapoints with erroneous negative depth + if "depth_of_observer" in df.columns: + # Only observed two datapoints where this happens + df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: # df["timestamp"] = 
df["datetime"].apply(datetime2timestamp) - if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): + # Add default site if it is missing + if "site" not in df.columns: + df["site"] = df["dataset"] + "_site" + + if any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( labels=[x for syn in TAXONOMY_RANKS for x in syn], @@ -479,6 +538,7 @@ def check_image_url(url): if ( url.startswith("https://hs.pangaea.de/Images/Benthos/AntGlassSponges/") and "AHEAD" not in url + and "DOWN" not in url ): # Images of AntGlassSponges must contain "AHEAD" to be kept # otherwise, they are of sponges after removal @@ -682,6 +742,903 @@ def fixup_repeated_urls( return df +def fixup_favourite_images(df, verbose=1): + """ + Drop duplicated favourite images. + + These occur in Schewe and Bergmann's datasets along OFOS profiles during + POLARSTERN cruises, PANGAEA dataset ids 849814--849816. 873995--874002, + 895102--895104, 896545--896549, 896653--896657, 912471. + + + Parameters + ---------- + df : pandas.DataFrame + A PANGAEA dataframe with Type column. + verbose : int, default=1 + Level of verbosity. + + Returns + ------- + df : pandas.DataFrame + As input dataframe, but with all Type entries starting with favourite + removed (case-insensitive). + """ + n_samples_before = len(df) + if "Type" in df.columns: + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occuring immediately + # after them in the dataframe. + df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "image" in df.columns: + # Check if the image filename field is repeated except for a leading + # "FAVOURITE_" string, if so remove it. These images are identical + # copies of the other images. + select = df["image"].str.lower().str.startswith("favourite") + image_tmp = df["image"].str.replace("FAVOURITE_", "", case=False, regex=False) + is_repeated = image_tmp.duplicated(False) + # Remove favourite images which are repeated + df = df[~(select & is_repeated)] + n_samples_after = len(df) + if verbose >= 1 and n_samples_after != n_samples_before: + print( + f"{df.iloc[0]['dataset']}:" + f" Removed {n_samples_before - n_samples_after} favourited images." + f" {n_samples_before} -> {n_samples_after} rows" + ) + return df + + +def get_dataset_datetime(ds_id): + """ + Determine a generic date for a dataset from the min and max extent datetimes. + + Parameters + ---------- + ds_id : int + The identifier of a PANGAEA dataset. + + Returns + ------- + dt_avg : str + The average datetime between the min and max extent, with precision + reduced to reflect what can accurately be represented. 
+ """ + ds = PanDataSet(ds_id, enable_cache=True) + dt_min = pd.to_datetime(ds.mintimeextent) + dt_max = pd.to_datetime(ds.maxtimeextent) + if dt_min is None and dt_max is None: + return pd.NaT + elif dt_min is None: + return dt_max.strftime("%Y-%m-%d") + elif dt_max is None: + return dt_min.strftime("%Y-%m-%d") + delta = dt_max - dt_min + dt_avg = dt_min + delta / 2 + if delta > datetime.timedelta(days=90): + return dt_avg.strftime("%Y") + if delta > datetime.timedelta(days=4): + return dt_avg.strftime("%Y-%m") + if delta > datetime.timedelta(hours=3): + return dt_avg.strftime("%Y-%m-%d") + if delta > datetime.timedelta(minutes=5): + return dt_avg.strftime("%Y-%m-%d %H:00:00") + if delta > datetime.timedelta(seconds=5): + return dt_avg.strftime("%Y-%m-%d %H:%M:00") + return dt_avg.strftime("%Y-%m-%d %H:%M:%S") + + +def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): + """ + Extract datetime information from the contents of the image column in the dataframe. + + Note that the extraction operation is only performed on dataset IDs for + which the image naming scheme has been manually evaluated, and is not + applied blindly to datasets which have not been inspected. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int + The identifier of the PANGAEA dataset. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells filled in from the image. + Existing datetime values are unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + ds_id = int(ds_id) + + select = df["datetime"].isna() + + if row2basename is None: + selected_image = df.loc[select, "image"] + else: + selected_image = df[select].apply( + partial(row2basename, use_url_extension=True), axis=1 + ) + + selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) + + if 371062 <= ds_id <= 371064: + # e.g. PO309_41-1_2004-04-05T08_55_41.jpg + # e.g. PO309_41-2-1_2004-04-05T11_28_26.jpg + # e.g. PO322_211-4-1_2005-05-18T19_35_31.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:])) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%dT%H-%M-%S") + + elif ds_id in [ + 785104, + 785105, + 785106, + 785107, + 785108, + 785109, + 785110, + 836457, + 867771, + 867772, + 867773, + 867774, + 867775, + 867776, + 867777, + 867778, + 867806, + 867807, + 867808, + 867852, + 867853, + 867861, + 873541, + 875713, + 875714, + 876422, + 876423, + 876511, + 876512, + 876513, + 876514, + 876515, + 876516, + 876517, + 876518, + 880043, + 880044, + 885666, + 885667, + 885668, + 885669, + 885670, + 885672, + 885674, + 885675, + 885709, + 885712, + 885713, + 885714, + 885715, + 885716, + 885717, + 885718, + 885719, + 885720, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. PP_107-100_2012-03-19.png + # e.g. PP_100_2012-06-05a.jpg + # e.g. TH_122_2012-03-27.jpg + # e.g. J_05_2017_05_24a.jpg + # e.g. J_overview_2017-05-24za.jpg + # e.g. J_40_2017_08_11a.jpg + # e.g. J_05_2017-08-11a.jpg + # e.g. LG_OVERVIEW_01_05_06_07_09_2013_02_24a.jpg + # e.g. LG_01_07_2010_11_11a.jpg + # e.g. LG_01_2010_11_11a.jpg + # e.g. LG_Cluster1_2012_01_31a.jpg + # e.g. LG_01_07_2012_04_22a.jpg + # e.g. LG_SCREW_2012_04_22a.jpg + # e.g. So_01_2014_02_15b.jpg + # e.g. XH_01_2013_01_12_a.jpg + # e.g. XH_01%2B09_2013_11_19_a.jpg + # e.g. XH_01_2010_04_22_a.jpg + # e.g. LH_020_2015_01_28a_counted.jpg + # e.g. 
LH_020_2015_01_28xx.jpg + # e.g. J_J40%2BJ46%2BJ41_2016_09_25_a.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-11:].str.replace("_", "-").str.lstrip("-") + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [ + 789211, + 789212, + 789213, + 789214, + 789215, + 789216, + 789219, + 819234, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2003_W01-2.jpg + # e.g. 2004_B_bewachsen.jpg + # e.g. 2005_B.jpg + # e.g. 2013_B01-1.jpg + dtstr = selected_image_no_ext.str[:4] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [ + 789217, + 793210, + 793211, + 818906, + 818907, + 836263, + 836264, + 836265, + 836266, + 837653, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 04_2011.jpg + # e.g. 04a_2011_analog.jpg + # e.g. 04.2-2008.jpg + # e.g. 08-2008.jpg + # e.g. 04a_2013.jpg + # e.g. 05a_2003.jpg + # e.g. 04_2007.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-4:] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [836024, 836025]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 00setting_2014-08.jpg + # e.g. 39.9_2014.jpg + # e.g. 2014_B01-1.jpg + df.loc[select, "datetime"] = "2014" + + elif ds_id in [840699, 840700, 840702, 840703, 840742, 840743]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_001_2012-01-31.jpg + # e.g. J_003_2012-01-31_2.jpg + # e.g. J_115_2012-01-31_a.jpg + # e.g. J_033_2012-08-08.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + dtstr = dtstr.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [840701, 849298]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_002_2013-03_03a.jpg + # e.g. J_001_2015-01.jpg + # e.g. J_001_2015-01_a.jpg + # e.g. J_056_2013-03_06logger.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y-%m") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [872407, 872408, 872409, 872410, 872411]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_40_2017-01-12_a.jpg + # e.g. J_overview2_2017-02-02_x.jpg + # e.g. J_xx_2017-01-12_x-62.jpg + # e.g. J_17_2017-01-14.jpg + # e.g. J_23_2017-01-14_b-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [878045, 888410]: + # Nothing to do + pass + + elif ds_id in [894734]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. HOTKEY_2018_03_27at21_09_21CP4A4682 + # e.g. 
TIMER_2018_03_18at04_04_09CP4A3970 + dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") + + elif ds_id in [896157, 896160, 896164]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2016-08-2600000.jpg + dtstr = selected_image_no_ext.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + if ds_id in [ + 918232, + 918233, + 918327, + 918340, + 918341, + 918382, + 918383, + 918385, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. XH_01_2010_04_22_a.jpg + # e.g. XH_01_2010_04_28a.jpg + # e.g. XH_03_2018_10_18_a-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:5])[:10]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + return df + + +def add_missing_datetime(df, ds_id=None, verbose=1): + """ + Add missing datetime values using either the mean extent or extraction from the file name. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells completed, either by using the + average from the datetime extent metadata, or by extracting it from the + image name. + All existing datetime values are left unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + # Add datetimes that are still missing by inferring from the image filename + df = fix_missing_datetime_from_image_name(df, ds_id, verbose=verbose) + + if all(df["datetime"].isna()): + # This dataset has no datetime values + # Try to determine average datetime from the datetime extent metadata on + # the dataset record + dt_avg = get_dataset_datetime(ds_id) + if dt_avg is not None: + if verbose >= 1: + print( + f"{ds_id}: Using average datetime from extent" + f" - filenames look like {df.iloc[0]['image']}" + ) + df["datetime"] = dt_avg + + if not any(df["datetime"].isna()): + # This dataframe already has all datetime information + return df + + select = df["datetime"].isna() + if ds_id in [889035, 889025]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the abstract on PANGAEA (sic): + # Experimet was setup during 2007-02-15 and 2007-06-13. + df.loc[select, "datetime"] = "2007" + + if ds_id in [896160, 896164]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the INDEX 2016 ROV (see dataset title and paper + # https://doi.org/10.3389/fmars.2019.00096) + df.loc[select, "datetime"] = "2016" + + return df + + +def interpolate_by_datetime(df, columns, **kwargs): + """ + Use datetime column to interpolate values for selected columns. + + Parameters + ---------- + df : pandas.DataFrame + Dataframe with ``"datetime"`` column, which may contain missing values + in other columns. + columns : str or iterable of str + Name of column or columns to fill in missing values with interpolation. + **kwargs + Additional arguments as per :func:`numpy.interp`. 
+ + Returns + ------- + df : pandas.DataFrame + Like input, but with missing values in specified columns completed by + linear interpolation over datetime. + """ + # Convert datetime string to a datetime object + datetime_actual = pd.to_datetime(df["datetime"]) + has_datetime = ~datetime_actual.isna() + if isinstance(columns, str): + columns = [columns] + for col in columns: + if col not in df: + continue + interp_kwargs = kwargs + if col in ["depth", "depth_of_observer", "bathymetry", "altitude"]: + if "left" not in interp_kwargs: + interp_kwargs["left"] = np.nan + if "right" not in interp_kwargs: + interp_kwargs["right"] = np.nan + has_col = ~df[col].isna() + has_dt_and_col = has_datetime & has_col + has_dt_not_col = has_datetime & ~has_col + df.loc[has_dt_not_col, col] = np.interp( + datetime_actual[has_dt_not_col], + datetime_actual[has_dt_and_col], + df.loc[has_dt_and_col, col], + **interp_kwargs, + ) + return df + + +def fixup_incomplete_metadata(df, ds_id=None, verbose=1): + """ + Fix datasets which have partial, but incomplete, lat/lon/datetime metadata. + + Interpolation is performed as appropriate to the dataset. The methodology + was determined by manually inspecting each dataset. + Any latitude and longitude values which can not be resolved are filled in + with the dataset-level mean latitude and longitude as reported by PANGAEA. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime, latitude, longitude, and/or depth + cells completed by interpolation or similar. + All existing datetime values are left unchanged. + """ + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + if ds_id in [753197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + + if ds_id in [805606, 805607, 805611, 805612]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print(f"{ds_id}: Interpolating by index") + indices = np.arange(len(df)) + col = "datetime" + select_not_col = df[col].isna() + select_has_col = ~select_not_col + if any(select_has_col) and any(select_not_col): + missing_timestamps = np.interp( + indices[select_not_col], + indices[select_has_col], + pd.to_datetime(df.loc[select_has_col, "datetime"]).apply( + lambda x: x.timestamp() + ), + ) + df.loc[select_not_col, col] = [ + datetime.datetime.fromtimestamp(int(ts)) for ts in missing_timestamps + ] + + if ds_id == 875080: + # N.B. There is date metadata in the csv, but not time. But there is time + # metadata in the filename, so we could extract this if we wanted to. + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + # lat/lon was only recorded for the first 11 images. Fill in the rest + # with the median latitude and longitude for the record at the end + # of this function. 
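The dataset-specific fixes below lean on `interpolate_by_datetime`, defined above. A self-contained sketch of the underlying idea, with synthetic rows and the `depth_of_observer` column name borrowed from this module (`frame`, `t`, and `known` are illustrative names):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame({
    "datetime": pd.to_datetime(
        ["2017-01-01 10:00", "2017-01-01 10:05", "2017-01-01 10:10", "2017-01-01 10:15"]
    ),
    "depth_of_observer": [100.0, np.nan, np.nan, 130.0],
})
t = frame["datetime"].astype("int64")  # nanoseconds since the epoch
known = frame["depth_of_observer"].notna()
frame.loc[~known, "depth_of_observer"] = np.interp(
    t[~known], t[known], frame.loc[known, "depth_of_observer"]
)
# the two missing rows become 110.0 and 120.0
```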
+ + if 873995 <= ds_id <= 874002: + if verbose >= 1: + print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") + # Interpolate lat, lon, and depth based on datetime + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) + + if ds_id in [875071, 875073]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Drop rows without datetime values (these have missing lat/lon as well) + # For 875071, these images are of the deck of the ship. + # For 875073, these images have a translation of less than half an image + # from the subsequent image, so we don't need the ones without metadata. + df = df[~df["datetime"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if ds_id in [875084]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # For 875084, images without latitude and longitude are not useful. + # The first three are of the deck, the rest are dark watercolumn shots. + df = df[~df["longitude"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if (878001 <= ds_id <= 878019) or ds_id == 878045: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # Images without metadata are of the water column and highly redundant. + df = df[~df["longitude"].isna()] + + if ds_id in [894732, 894734]: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # It's not clear to me that any of these images are of the seafloor. + df = df[~df["longitude"].isna()] + + if ds_id in [895557, 903782, 903788, 903850, 907025, 894801]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print( + f"{ds_id}: Interpolating by index over subset of images in the same series" + ) + indices = np.arange(len(df)) + image_no_ext = df["image"].apply(lambda x: os.path.splitext(x)[0]) + image_major = image_no_ext.str[:-3] + missing_dt = df["datetime"].isna() + missing_lat = df["latitude"].isna() + missing_lon = df["longitude"].isna() + for image_major_i in image_major.unique(): + select = image_major == image_major_i + col = "latitude" + select_and_col = select & ~missing_lat + select_not_col = select & missing_lat + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "longitude" + select_and_col = select & ~missing_lon + select_not_col = select & missing_lon + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "datetime" + select_and_col = select & ~missing_dt + select_not_col = select & missing_dt + if any(select_and_col) and any(select_not_col): + new_values = scipy.interpolate.interp1d( + indices[select_and_col], + pd.to_datetime(df.loc[select_and_col, col]).map( + pd.Timestamp.timestamp + ), + kind="nearest", + fill_value="extrapolate", + )(indices[select_not_col]) + new_values = pd.to_datetime(new_values, unit="s") + new_values = new_values.strftime("%Y-%m-%d") + df.loc[select_not_col, col] = new_values + + if ds_id in [911904, 918924, 919348]: + if verbose >= 1: + print(f"{ds_id}: Extracting missing datetime metadata for dataset {ds_id}") + # Extract missing datetime from the 
filename, formatted like (e.g.) + # TIMER_2019_03_31_at_05_50_12_IMG_0263 + has_no_datetime = df["datetime"].isna() + fname_inner = df.loc[has_no_datetime, "image"].apply( + lambda x: "_".join(x.split("_")[1:-2]) + ) + df.loc[has_no_datetime, "datetime"] = pd.to_datetime( + fname_inner, format="%Y_%m_%d_at_%H_%M_%S" + ) + if verbose >= 1: + print( + f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" + ) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) + + if ds_id in [914155]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Images without datetime are too dark + df = df[~df["datetime"].isna()] + # Other images are missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914156, 914197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. + for from_image, to_image in [ + ("IMG_0393", "IMG_0392"), + ("IMG_0395", "IMG_0394"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914192]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. 
+ for from_image, to_image in [ + ("IMG_1776", "IMG_1775"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ( + (702075 <= ds_id <= 702080) + or (818484 <= ds_id <= 818509) + or ds_id in [849287, 849289] + or 862084 <= ds_id <= 862097 + or ds_id in [875072, 875074] + or 875081 <= ds_id <= 875085 + ): + if verbose >= 1: + print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) + + if any(df["latitude"].isna() | df["longitude"].isna()): + # Fill in any missing latitude and longitude values with the + # mean coordinate reported at the dataset level + ds = PanDataSet(ds_id, enable_cache=True) + if hasattr(ds, "geometryextent"): + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Latitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean latitude for missing values") + df.loc[df["latitude"].isna(), "latitude"] = lat + if long is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean longitude for missing values") + df.loc[df["longitude"].isna(), "longitude"] = long + + return df + + +def merge_duplicated_urls(df): + """ + Merge metadata across rows which have the same URL. + """ + print("Original number of rows:", len(df)) + df.drop_duplicates(inplace=True) + print("Number of rows after dropping simple duplicates:", len(df)) + # Record the original sort index so we can get the data back in the original + # order. + df["original_index"] = df.index + # Determine how many images are at the same location. This indicates how + # accurate the latitude and longitude information is. We will want to keep + # the most accurate version of this. + repeat_location_counts = df[["longitude", "latitude"]].value_counts() + repeat_location_counts = repeat_location_counts.to_frame() + repeat_location_counts.rename(columns={0: "tally_repeated_location"}, inplace=True) + # Add the tally_repeated_location data as a new column + df = df.merge(repeat_location_counts, how="left", on=["latitude", "longitude"]) + + def resolve_duplicates(sdf): + if len(sdf) == 1: + # If there's only one row in the group, return it. + return sdf.iloc[0] + # Take the entry which has the fewest repetitions of the latitude and + # longitude value. We will use the version from the first dataset that + # had the fewest repetitions of the location for this image. + # We adopt this row's collection, dataset, and site values in addition + # to its coordinates. + idx = np.argmin(sdf["tally_repeated_location"]) + row = sdf.iloc[idx].copy() + # For numeric columns (other than latitude and longitude), take the + # average of the values where they are present. 
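The surrounding `resolve_duplicates` helper reduces each group of rows sharing a URL to a single row. A toy end-to-end sketch of the same groupby pattern with simplified rules (`rows` and `reduce_group` are illustrative names, not part of the module):

```python
import pandas as pd

rows = pd.DataFrame({
    "url": ["a.jpg", "a.jpg", "b.jpg"],
    "depth_of_observer": [None, 20.0, 7.0],
    "datetime": ["2016", "2016-08-26", "2014-02"],
})

def reduce_group(group):
    row = group.iloc[0].copy()
    # average numeric values where present (NaNs are ignored by .mean())
    row["depth_of_observer"] = group["depth_of_observer"].mean()
    # keep the most precise (longest) datetime string
    row["datetime"] = group.loc[group["datetime"].str.len().idxmax(), "datetime"]
    return row

merged = rows.groupby("url").apply(reduce_group)
```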
+ for col in [ + "depth_of_observer", + "altitude", + "bathymetry", + "salinity", + "temperature", + "acidity", + "area", + ]: + if col not in sdf.columns: + continue + select = ~pd.isna(sdf[col]) + if select.sum() == 0: + continue + row[col] = sdf[select][col].mean() + # Look to see if we are missing an image or thumbnail entry and one + # of the duplicates has its value. + for col in ["image", "url_thumbnail"]: + if col not in sdf.columns: + continue + if not pd.isna(row[col]): + continue + values = sdf[col] + values = values[~pd.isna(values)] + if len(values) == 0: + continue + row[col] = values.iloc[0] + # For datetime, use the fact that we encoded datetime as a string + # with varying levels of precision. More digits means higher precision. + # Take the most precise value, preferring the value from the selected + # record in the event of a tie. + datetime_len = sdf["datetime"].str.replace(" 00:00:00", "").str.len() + idx_dt = np.argmax(datetime_len) + if datetime_len.iloc[idx] != datetime_len.iloc[idx_dt]: + row["datetime"] = sdf.iloc[idx_dt]["datetime"] + return row + + print("Merging metadata between rows with the same URL") + # Group by URL and apply our transformation to each group + df_out = df.groupby("url").progress_apply(resolve_duplicates) + # Reorder the dataframe to preseve implicit temporal information from the + # ordering of the images + df_out.sort_values("original_index", inplace=True) + df_out.drop(columns=["original_index", "tally_repeated_location"], inplace=True) + return df_out + + +def process_single(df, ds_id=None, verbose=1, remove_duplicate_columns=False): + """ + Reformat and cleanup metadata for a single dataset. + + Parameters + ---------- + df : pandas.Dataframe + The dataset to process. + ds_id : int, optional + The ID number for the PANGAEA dataset. If omitted, it is inferred from + the ``ds_id`` column of ``df``. + verbose : int, default=1 + Verbosity level. + remove_duplicate_columns : bool, default=False + Whether to remove duplicate column names. + + Returns + ------- + df : pandas.Dataframe + A processed copy of the dataset. 
+ """ + if df is None or len(df) == 0: + return df + + if ds_id is None: + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = int(ds_id.split("-")[-1]) + + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + df["ds_id"] = df["ds_id"].str.replace("pangaea-pangaea-", "pangaea-") + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df["parent_ds_id"] = df["parent_ds_id"].str.replace( + "pangaea-pangaea-", "pangaea-" + ) + + df = reformat_df(df, remove_duplicate_columns=remove_duplicate_columns) + if df is None: + return df + + url_col = "url" + df = df[df[url_col] != ""] + if len(df) == 0: + return df + + df = filter_urls(df, url_column=url_col) + if len(df) == 0: + return df + + # Drop rows that are complete duplicates + df.drop_duplicates(inplace=True) + + # Try to fix repeated URLs that are accidental dups but should differ + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) + + # Check for any rows that are all NaNs + if sum(df.isna().all("columns")) > 0: + print(f"{ds_id} has a row which is all NaNs") + + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. @@ -711,11 +1668,18 @@ def process_datasets(input_dirname, output_path=None, verbose=0): n_valid = 0 dfs = [] dfs_fnames = [] + ids_with_potential_labels = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 + if not fname.endswith(".csv"): + continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - df = pd.read_csv(os.path.join(input_dirname, fname)) + if ds_id in ["805690", "803979"]: + # The title was not captured from this dataset for some reason, + # so we can't exclude it via the title. + continue + df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): continue @@ -725,7 +1689,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_without_url.append(fname) continue - df["ds_id"] = f"pangaea-{ds_id}" + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df = reformat_df(df) if df is None: continue @@ -745,6 +1715,30 @@ def process_datasets(input_dirname, output_path=None, verbose=0): column_count[col] += 1 column_examples[col].append(fname) + for key in [ + # "Type", + "Content", # Yes! + # "Sample label", + # "ID", + # "Sample ID", + "Classification", # Yes! + "Species", # Yes! + # "Reference", + # "Samp type", + "Family", + "Genus", + # "Ind No", + # "Imagery", + # "Img brightness", # No + "Ground vis", # Yes! 
+ "Marine litter", + "Fisheries plastic", + "Unident litter", + ]: + if key in df.columns: + print(f"{fname} has {key}") + ids_with_potential_labels.append(ds_id) + # Drop rows that are complete duplicates df.drop_duplicates(inplace=True) @@ -752,7 +1746,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_with_repeat_urls.append(fname) # Try to fix repeated URLs that are accidental dups but should differ - df = fixup_repeated_urls(df, url_column=url_col, verbose=1) + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) if len(df) != len(df.drop_duplicates(subset=url_col)): files_with_repeat_urls2.append(fname) @@ -761,26 +1755,47 @@ def process_datasets(input_dirname, output_path=None, verbose=0): if sum(df.isna().all("columns")) > 0: print(f"{ds_id} has a row which is all NaNs") + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + dfs.append(df) dfs_fnames.append(fname) - print(f"There are {n_valid} valid (of {n_total}) valid datasets") - print( - f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" - ) - print( - f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" - ) - print() - print(f"There are {len(column_count)} unique column names:") - print() - - for col, count in dict( - sorted(column_count.items(), key=lambda item: item[1], reverse=True) - ).items(): - c = col + " " - print(f"{c:.<35s} {count:4d}") - print() + if verbose >= 0: + print(f"There are {n_valid} valid (of {n_total}) valid datasets") + print( + f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" + ) + for fname in files_with_repeat_urls: + print(f" {fname}") + print( + f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" + ) + for fname in files_with_repeat_urls2: + print(f" {fname}") + print() + print(f"There are {len(column_count)} unique column names:") + print() + + for col, count in dict( + sorted(column_count.items(), key=lambda item: item[1], reverse=True) + ).items(): + c = col + " " + print(f"{c:.<35s} {count:4d}") + print() + ids_with_potential_labels = sorted(set(ids_with_potential_labels)) + print( + f"There are {len(ids_with_potential_labels)} datasets which might have labels to extract:" + ) + for ds_id in ids_with_potential_labels: + print(ds_id) + print() if verbose >= 1: print("Filter columns") @@ -789,26 +1804,43 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "dataset", "site", "url", + "url_thumbnail", "image", "datetime", "latitude", "longitude", + "area", "altitude", - "depth", + "depth_of_observer", + "bathymetry", "backscatter", "temperature", "salinity", "chlorophyll", "acidity", + "parent_ds_id", } df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + df_all.rename(columns={"parent_ds_id": "collection"}, inplace=True) + + print(f"There are {len(df_all)} records before dropping duplicated URLs") + + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + output_path_with_dups = os.path.splitext(output_path)[0] + "_with-duplicates.csv" + if verbose >= 0: + print(f"Saving (with duplicates) to {output_path_with_dups}") + 
df_all.to_csv(output_path_with_dups, index=False) # Remove duplicate URLs if verbose >= 1: - print("Remove duplicates") - df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print("Merge duplicated URLs") + # Convert datetime to string + df_all["datetime"] = df_all["datetime"].astype(str) + df_all = merge_duplicated_urls(df_all) + print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field if fixup_repeated_output_paths is None: @@ -817,12 +1849,10 @@ def process_datasets(input_dirname, output_path=None, verbose=0): else: if verbose >= 1: print("Fix repeated output paths to prevent collisions") - df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=2) + df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) - if os.path.dirname(output_path): - os.makedirs(os.path.dirname(output_path), exist_ok=True) if verbose >= 0: - print(f"Saving to {output_path}") + print(f"Saving (without duplicates) to {output_path}") df_all.to_csv(output_path, index=False) @@ -875,7 +1905,7 @@ def get_parser(): "--verbose", "-v", action="count", - default=0, + default=1, help=textwrap.dedent( """ Increase the level of verbosity of the program. This can be diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index bb17b7f..9623b10 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -16,7 +16,12 @@ from pangaea_downloader.tools import datasets, process, scraper, search -def search_and_download(queries=None, output_dir="query-outputs", verbose=0): +def search_and_download( + queries=None, + output_dir="query-outputs", + auth_token=None, + verbose=0, +): """ Search `PANGAEA`_ for a set of queries, and download datasets for each result. @@ -31,6 +36,8 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): output_dir : str, default="query-outputs" The output directory where downloaded datasets will be saved. Any existing output datasets will be skipped instead of downloaded. + auth_token : str, optional + Bearer authentication token. verbose : int, default=1 Verbosity level. 
""" @@ -49,6 +56,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): os.makedirs(output_dir, exist_ok=True) df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) + fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" + if not os.path.isfile(fname_child2parent): + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") + # Process each result dictionary n_files = 0 n_downloads = 0 @@ -75,7 +87,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) if df_list is None: if verbose >= 1: print( @@ -93,9 +109,32 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): + colorama.Fore.RESET ) continue - df = pd.concat(df_list) + for df in df_list: + if df is None or len(df) == 0: + continue + # Add the parent's ID to the dataframe + df["parent_ds_id"] = ds_id + # Save the child to its own CSV, including a column that + # records the parent's dataset ID + child_id = df.iloc[0]["ds_id"] + child_output_path = os.path.join(output_dir, f"{child_id}.csv") + saved = datasets.save_df( + df, child_output_path, level=1, verbose=verbose - 1 + ) + n_downloads += 1 if saved else 0 + with open(fname_child2parent, "a") as f: + f.write(f"{child_id},{ds_id}\n") + # We have saved all the children individually, so will skip + # saving a redundant merged dataframe + # But we will save an empty file so we know to skip + with open(output_path, "w") as f: + f.write("is_parent") + continue else: - dataset_type = process.ds_type(size) + try: + dataset_type = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {ds_id}") if dataset_type == "video": if verbose >= 1: print( @@ -107,7 +146,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): elif dataset_type == "paginated": df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": - df = datasets.fetch_child(url, verbose=verbose - 1) + df = datasets.fetch_child( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) except Exception as err: if isinstance(err, KeyboardInterrupt): raise @@ -121,16 +164,7 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ----------------- SAVE TO FILE ----------------- # if df is None: continue - try: - saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) - except Exception as err: - # Delete partially saved file, if present - if os.path.isfile(output_path): - try: - os.remove(output_path) - except Exception: - pass - raise err + saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) n_downloads += 1 if saved else 0 if verbose >= 0: @@ -138,7 +172,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): print(f"Number of files previously saved: {n_files}.") print(f"Total dataset files: {n_files + n_downloads}") print(f"Number of dataset errors (excluding access): {len(errors)}.") - + if len(errors) > 0: + print() + print("Captured errors are now repeated as follows.") for msg in errors: print() print(msg) @@ -190,6 +226,11 @@ def get_parser(): default="query-outputs", help="Directory for downloaded datasets. 
Default is %(default)s.", ) + parser.add_argument( + "--auth-token", + type=str, + help="Bearer authentication token", + ) parser.add_argument( "--verbose", "-v", diff --git a/pangaea_downloader/tools/checker.py b/pangaea_downloader/tools/checker.py index 7292c43..3d345d6 100644 --- a/pangaea_downloader/tools/checker.py +++ b/pangaea_downloader/tools/checker.py @@ -61,8 +61,8 @@ def is_invalid_file_ext(filename: str) -> bool: # --------------------------------------------- DataFrame Checkers --------------------------------------------- # def has_url_col(df: DataFrame) -> bool: """Take a Pandas DataFrame and return True if it has image URL column.""" - condition1 = any(["url" in col.lower() for col in df.columns]) - condition2 = any(["image" in col.lower() for col in df.columns]) + condition1 = any("url" in col.lower() for col in df.columns) + condition2 = any("image" in col.lower() for col in df.columns) return condition1 or condition2 diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index b54c6d5..8b6f83a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -6,6 +6,8 @@ use pangaea_downloader.tools.scraper module. """ import os +import shutil +import tempfile import time from typing import List, Optional @@ -16,19 +18,24 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0.1667 +# T_POLL_INTV = 0 # Allow rapid loading of cached records +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 180 requests within 30s -def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: +def fetch_child( + child_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url) + ds = PanDataSet(child_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() - doi = getattr(ds, "doi", "").split("doi.org/")[-1] # Dataset is restricted if ds.loginstatus != "unrestricted": if verbose >= 1: @@ -39,7 +46,7 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Check for image URL column - if not checker.has_url_col(ds.data): + if ensure_url and not checker.has_url_col(ds.data): if verbose >= 1: print( colorama.Fore.YELLOW @@ -48,20 +55,29 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Add metadata - df = set_metadata(ds, alt=doi) + df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) + # Add dataset ID + doi = getattr(ds, "doi", "") + ds_id = uri2dsid(doi if doi else child_url) + df["ds_id"] = ds_id return df -def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: +def fetch_children( + parent_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url) + ds = PanDataSet(parent_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -78,9 +94,13 @@ def fetch_children(parent_url: str, 
@@ -78,9 +94,13 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
     df_list = []
     for i, child_uri in enumerate(ds.children):
         url = process.url_from_uri(child_uri)
+        ds_id = uri2dsid(child_uri)
         size = process.get_html_info(url)
         # Assess type
-        typ = process.ds_type(size)
+        try:
+            typ = process.ds_type(size)
+        except Exception:
+            raise ValueError(f"Can't process type from size for {url}")
         if typ == "video":
             if verbose >= 1:
                 print(
@@ -91,14 +111,12 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
             continue
         elif typ == "paginated":
             if verbose >= 1:
-                print(f"\t\t[{i+1}] Scrapping dataset...")
+                print(f"\t\t[{i+1}] Scraping dataset...")
             df = scraper.scrape_image_data(url)
-            if df is not None:
-                df_list.append(df)
         elif typ == "tabular":
             t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time())
             time.sleep(t_wait)  # Stay under 180 requests every 30s
-            child = PanDataSet(url)
+            child = PanDataSet(url, enable_cache=True, auth_token=auth_token)
             T_POLL_LAST = time.time()
             if ds.loginstatus != "unrestricted":
                 if verbose >= 1:
@@ -107,21 +125,23 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
                         + f"\t\t[{i+1}] [ERROR] Access restricted: '{ds.loginstatus}'. {url}"
                         + colorama.Fore.RESET
                     )
-                return
-            if not checker.has_url_col(child.data):
+                continue
+            if ensure_url and not checker.has_url_col(child.data):
                 if verbose >= 1:
                     print(
                         colorama.Fore.YELLOW
                         + f"\t\t[{i+1}] [WARNING] Image URL column NOT found! {url} Skipping..."
                         + colorama.Fore.RESET
                     )
-            else:
-                # Add metadata
-                child_doi = getattr(child, "doi", "").split("doi.org/")[-1]
-                df = set_metadata(child, alt=child_doi)
-                # Add child dataset to list
-                df = exclude_rows(df)
-                df_list.append(df)
+                continue
+            # Add metadata
+            df = set_metadata(child)
+            # Add child dataset to list
+            df = exclude_rows(df)
+            if df is None:
+                continue
+            df["ds_id"] = ds_id
+            df_list.append(df)

     # Return result
     if len(df_list) > 0:
@@ -131,20 +151,13 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]:
     return None


-def set_metadata(ds: PanDataSet, alt="unknown") -> DataFrame:
+def set_metadata(ds: PanDataSet) -> DataFrame:
     """Add metadata to a PanDataSet's dataframe."""
     ds.data["dataset_title"] = ds.title
     ds.data["doi"] = getattr(ds, "doi", "")
     # Dataset campaign
     if (len(ds.events) > 0) and (ds.events[0].campaign is not None):
         ds.data["campaign"] = ds.events[0].campaign.name
-    else:
-        ds.data["campaign"] = alt
-    # Dataset site/event/deployment
-    if "Event" in ds.data.columns:
-        ds.data["site"] = ds.data["Event"]
-    else:
-        ds.data["site"] = alt + "_site"
     return ds.data


@@ -162,9 +175,16 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) ->
         if verbose >= 1:
             print(f"{tabs}[{idx}] Empty DataFrame! File not saved!")
         return False
-    # Save if dataframe not empty
-    df.to_csv(output_path, index=False)
-    print(f"{tabs}[{idx}] Saved to '{output_path}'")
+    # Save dataframe if it is not empty
+    with tempfile.TemporaryDirectory() as dir_tmp:
+        # Write to a temporary file
+        tmp_path = os.path.join(dir_tmp, os.path.basename(output_path))
+        df.to_csv(tmp_path, index=False)
+        # Move our temporary file to the destination
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        shutil.move(tmp_path, output_path)
+    if verbose >= 1:
+        print(f"{tabs}[{idx}] Saved to '{output_path}'")
     return True


@@ -212,6 +232,13 @@ def fix_text(text: str) -> str:
     return text


+def uri2dsid(uri: str) -> str:
+    """
+    Extract PANGAEA dataset ID from url/uri/doi string.
+ """ + return uri.split("PANGAEA.")[-1] + + def get_dataset_id(df: DataFrame) -> str: """Take a Pandas DataFrame as input and return the datasets Pangaea ID.""" col = find_column_match("doi") diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py index 219fa31..9696c93 100644 --- a/pangaea_downloader/tools/eda.py +++ b/pangaea_downloader/tools/eda.py @@ -4,10 +4,11 @@ import matplotlib.cm import matplotlib.colors import numpy as np -import requests from matplotlib.pyplot import get_cmap from sklearn.neighbors import KernelDensity +from . import requesting + def url_from_doi(doi: str) -> str: """ @@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array: """Take an image url and return retrieved image array.""" success = False while not success: - resp = requests.get(url, stream=True) + resp = requesting.get_request_with_backoff(url, stream=True) print(f"status code: {resp.status_code}") if verbose else 0 success = True if (resp.status_code == 200) else False if success: diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py index 6d92e73..74d8ea6 100644 --- a/pangaea_downloader/tools/process.py +++ b/pangaea_downloader/tools/process.py @@ -1,9 +1,10 @@ """Functions for processing each of the result items.""" from typing import Optional, Tuple -import requests from bs4 import BeautifulSoup +from . import requesting + def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str: """Take a pangaea uri/doi string as input and return its corresponding url string.""" @@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]: def get_html_info(url: str) -> Optional[str]: """Make get request to dataset webpage and return dataset size.""" # Make get request to webpage - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) if resp.status_code == 200: # Parse html soup = BeautifulSoup(resp.text, "lxml") diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py new file mode 100644 index 0000000..1ec7ef0 --- /dev/null +++ b/pangaea_downloader/tools/requesting.py @@ -0,0 +1,49 @@ +""" +URL request utilities. +""" + +import time + +import requests + + +def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs): + """ + Fetch a URL resource using requests with a custom backoff strategy for re-attempts. + + Parameters + ---------- + url : str + The URL to request. + retries : int, default=5 + Maximum number of attempts. + backoff_factor : float, default=1 + Base time to wait for before attempting to download again when receiving + a 500 or 503 HTTP status code. + verbose : int, default=1 + Verbosity level. + **kwargs + Additional arguments as per :func:`requests.get`. + """ + for i_attempt in range(retries): + r = requests.get(url, **kwargs) + if r.status_code not in [429, 500, 503]: + # Status code looks good + break + # N.B. Could also retry on [408, 502, 504, 599] + if r.status_code == 429: + # PANGAEA has a maximum of 180 requests within a 30s period + # Wait for this to cool off completely. + t_wait = 30 + else: + # Other errors indicate a server side error. Wait a + # short period and then retry to see if it alleviates. 
diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py
index 219fa31..9696c93 100644
--- a/pangaea_downloader/tools/eda.py
+++ b/pangaea_downloader/tools/eda.py
@@ -4,10 +4,11 @@
 import matplotlib.cm
 import matplotlib.colors
 import numpy as np
-import requests
 from matplotlib.pyplot import get_cmap
 from sklearn.neighbors import KernelDensity

+from . import requesting
+

 def url_from_doi(doi: str) -> str:
     """
@@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array:
     """Take an image url and return retrieved image array."""
     success = False
     while not success:
-        resp = requests.get(url, stream=True)
+        resp = requesting.get_request_with_backoff(url, stream=True)
         print(f"status code: {resp.status_code}") if verbose else 0
         success = True if (resp.status_code == 200) else False
         if success:
diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py
index 6d92e73..74d8ea6 100644
--- a/pangaea_downloader/tools/process.py
+++ b/pangaea_downloader/tools/process.py
@@ -1,9 +1,10 @@
 """Functions for processing each of the result items."""
 from typing import Optional, Tuple

-import requests
 from bs4 import BeautifulSoup

+from . import requesting
+

 def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str:
     """Take a pangaea uri/doi string as input and return its corresponding url string."""
@@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]:
 def get_html_info(url: str) -> Optional[str]:
     """Make get request to dataset webpage and return dataset size."""
     # Make get request to webpage
-    resp = requests.get(url)
+    resp = requesting.get_request_with_backoff(url)
     if resp.status_code == 200:
         # Parse html
         soup = BeautifulSoup(resp.text, "lxml")
diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py
new file mode 100644
index 0000000..1ec7ef0
--- /dev/null
+++ b/pangaea_downloader/tools/requesting.py
@@ -0,0 +1,49 @@
+"""
+URL request utilities.
+"""
+
+import time
+
+import requests
+
+
+def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs):
+    """
+    Fetch a URL resource using requests with a custom backoff strategy for re-attempts.
+
+    Parameters
+    ----------
+    url : str
+        The URL to request.
+    retries : int, default=5
+        Maximum number of attempts.
+    backoff_factor : float, default=1
+        Base time to wait before attempting the download again when receiving
+        a 500 or 503 HTTP status code.
+    verbose : int, default=1
+        Verbosity level.
+    **kwargs
+        Additional arguments as per :func:`requests.get`.
+    """
+    for i_attempt in range(retries):
+        r = requests.get(url, **kwargs)
+        if r.status_code not in [429, 500, 503]:
+            # Status code looks good
+            break
+        # N.B. Could also retry on [408, 502, 504, 599]
+        if r.status_code == 429:
+            # PANGAEA has a maximum of 180 requests within a 30s period
+            # Wait for this to cool off completely.
+            t_wait = 30
+        else:
+            # Other errors indicate a server side error. Wait a
+            # short period and then retry to see if it alleviates.
+            t_wait = backoff_factor * 2**i_attempt
+        if verbose >= 1:
+            print(
+                "Retrying in {} seconds (HTTP Status {}): {}".format(
+                    t_wait, r.status_code, url
+                )
+            )
+        time.sleep(t_wait)
+    return r
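To make the retry behaviour concrete, here is a small usage sketch of the new helper; the URL and the `timeout` keyword are illustrative values, not something this patch prescribes:

```python
from pangaea_downloader.tools.requesting import get_request_with_backoff

# Retries on HTTP 429, 500, and 503: waits a flat 30 s on 429 (the PANGAEA
# rate limit) and backoff_factor * 2**attempt seconds (1, 2, 4, ...) otherwise.
resp = get_request_with_backoff(
    "https://doi.pangaea.de/10.1594/PANGAEA.805606",
    retries=5,
    backoff_factor=1,
    timeout=30,  # forwarded to requests.get via **kwargs
)
resp.raise_for_status()
print(resp.status_code, len(resp.content))
```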
diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py
index 95218f2..a36cd34 100644
--- a/pangaea_downloader/tools/scraper.py
+++ b/pangaea_downloader/tools/scraper.py
@@ -9,7 +9,7 @@
 from pangaeapy import PanDataSet
 from requests.compat import urljoin

-import pangaea_downloader.tools.datasets as datasets
+from pangaea_downloader.tools import datasets, requesting


 def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
@@ -17,16 +17,40 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
     # Load dataset
     t_wait = max(0, datasets.T_POLL_LAST + datasets.T_POLL_INTV - time.time())
     time.sleep(t_wait)  # Stay under 180 requests every 30s
-    ds = PanDataSet(url)
+    ds = PanDataSet(url, enable_cache=True)
     datasets.T_POLL_LAST = time.time()
     # Request dataset url
     if verbose >= 1:
         print("\t\t\t[INFO] Requesting:", url)
-    resp = requests.get(url)
+    resp = requesting.get_request_with_backoff(url)
     # Parse response
     soup = BeautifulSoup(resp.text, "lxml")
     # Get coordinates of expedition
     coordinates = get_metadata(soup)
+    if coordinates is None and hasattr(ds, "geometryextent"):
+        print(
+            colorama.Fore.YELLOW + "\t\t\t[ALERT] Trying to get coordinates from"
+            " PanDataSet.geometryextent" + colorama.Fore.RESET
+        )
+        lat = None
+        long = None
+        for k in ["meanLatitude", "latitude", "Latitude"]:
+            if k in ds.geometryextent:
+                lat = ds.geometryextent[k]
+                break
+        for k in ["meanLongitude", "longitude", "Longitude"]:
+            if k in ds.geometryextent:
+                long = ds.geometryextent[k]
+                break
+        # Only fall back to these values if at least one coordinate was found
+        if lat is not None or long is not None:
+            coordinates = lat, long
+
+    if coordinates is None:
+        print(
+            colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!"
+            " Saved file won't have Longitude, Latitude columns!"
+            + colorama.Fore.RESET
+        )
     # Get download link to photos page
     download_link = soup.find("div", attrs={"class": "text-block top-border"}).a["href"]

@@ -34,7 +58,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
     if verbose >= 1:
         print("\t\t\t[INFO] URL to photos page:", download_link)
     # Get to photos page (page 1)
-    resp = requests.get(download_link)
+    resp = requesting.get_request_with_backoff(download_link)
     photos_page = BeautifulSoup(resp.text, "lxml")
     img_urls = get_urls_from_each_page(photos_page, src_url, verbose=verbose)
     if img_urls is None:
@@ -47,17 +71,13 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]:
         lat, long = coordinates
         df["Longitude"] = long
         df["Latitude"] = lat
-    df["Dataset"] = ds.title
-    df["DOI"] = getattr(ds, "doi", "")
-    doi = getattr(ds, "doi", "").split("doi.org/")[-1]
+    df["dataset_title"] = ds.title
+    doi = getattr(ds, "doi", "")
+    df["DOI"] = doi
+    ds_id = datasets.uri2dsid(doi if doi else url)
+    df["ds_id"] = ds_id
     if (len(ds.events) > 0) and (ds.events[0].campaign is not None):
-        df["Campaign"] = ds.events[0].campaign.name
-    else:
-        df["Campaign"] = doi
-    if "Event" in ds.data.columns:
-        df["Site"] = ds.data["Event"]
-    else:
-        df["Site"] = doi + "_site"
+        df["campaign"] = ds.events[0].campaign.name
     return df


@@ -71,10 +91,6 @@ def get_metadata(page_soup: BeautifulSoup) -> Optional[Tuple[float, float]]:
         lat = float(coordinates.find("span", attrs={"class": "latitude"}).text)
         long = float(coordinates.find("span", attrs={"class": "longitude"}).text)
         return lat, long
-    print(
-        colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!"
-        " Saved file won't have Longitude, Latitude columns!" + colorama.Fore.RESET
-    )
     return None


@@ -91,7 +107,7 @@ def get_urls_from_each_page(
         if verbose >= 1:
             print(f"\t\t\t[INFO] Processing Page {n}...")
         url = pagination[n]
-        resp = requests.get(url)
+        resp = requesting.get_request_with_backoff(url)
         soup = BeautifulSoup(resp.text, "lxml")
         urls = get_page_image_urls(soup, verbose=verbose)
         img_urls.extend(urls)
@@ -111,7 +127,7 @@ def get_pagination(page_soup: BeautifulSoup, src_url: str) -> Optional[dict]:
     # List of page URLs
     page_urls = [urljoin(src_url, a["href"]) for a in pagination.find_all("a")][:-1]
     # Page number : Page URL
-    page_dict = {k: v for k, v in zip(page_nums, page_urls)}
+    page_dict = dict(zip(page_nums, page_urls))
     return page_dict


diff --git a/requirements.txt b/requirements.txt
index 368ee16..05b0ea0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,8 @@ matplotlib>=3.4.2
 numpy>=1.20.3
 opencv-python>=4.5.2.54
 pandas>=1.2.5
-pangaeapy>=0.0.5
+pangaeapy>=1.0.6
 requests>=2.25.1
 scikit-learn>=0.24.2
+scipy
 tqdm
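Finally, a sketch of how the per-query `*_child2parent.csv` mapping and the per-child CSVs written by `search_and_download` could be recombined downstream. The paths follow the defaults used in this patch, but the snippet itself is illustrative rather than part of the package:

```python
import os

import pandas as pd

output_dir = "query-outputs"  # assumed default output directory
child2parent = pd.read_csv(output_dir.rstrip("/") + "_child2parent.csv")

frames = []
for child_id, parent_id in zip(child2parent["child"], child2parent["parent"]):
    path = os.path.join(output_dir, f"{child_id}.csv")
    if not os.path.isfile(path):
        continue  # child was skipped (restricted, video, no image URL column, ...)
    df = pd.read_csv(path, low_memory=False)
    df["parent_ds_id"] = parent_id  # same column the downloader records
    frames.append(df)

merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f"{len(merged)} records from {len(frames)} child datasets")
```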