From 9a64b6eaa7b47e43c19bbe2ab7ac23ffa9389e72 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:33:56 +0100 Subject: [PATCH 01/92] MNT: Show which ds has None for size --- pangaea_downloader/pq_scraper.py | 5 ++++- pangaea_downloader/tools/datasets.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index bb17b7f..dfbf485 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -95,7 +95,10 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): continue df = pd.concat(df_list) else: - dataset_type = process.ds_type(size) + try: + dataset_type = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {ds_id}") if dataset_type == "video": if verbose >= 1: print( diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index b54c6d5..1965371 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -80,7 +80,10 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: url = process.url_from_uri(child_uri) size = process.get_html_info(url) # Assess type - typ = process.ds_type(size) + try: + typ = process.ds_type(size) + except Exception: + raise ValueError(f"Can't process type from size for {url}") if typ == "video": if verbose >= 1: print( From 0460e6021a9d8c273f9cc7427ef3796c49881269 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 9 Dec 2021 16:06:10 -0400 Subject: [PATCH 02/92] ENH: Add option to control whether URL columns are required --- pangaea_downloader/tools/datasets.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 1965371..dbc841a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,7 +19,7 @@ T_POLL_INTV = 0.1667 -def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: +def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST @@ -39,7 +39,7 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: ) return # Check for image URL column - if not checker.has_url_col(ds.data): + if ensure_url and not checker.has_url_col(ds.data): if verbose >= 1: print( colorama.Fore.YELLOW @@ -54,7 +54,9 @@ def fetch_child(child_url: str, verbose=1) -> Optional[DataFrame]: return df -def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: +def fetch_children( + parent_url: str, verbose=1, ensure_url=True +) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset global T_POLL_LAST @@ -111,7 +113,7 @@ def fetch_children(parent_url: str, verbose=1) -> Optional[List[DataFrame]]: + colorama.Fore.RESET ) return - if not checker.has_url_col(child.data): + if ensure_url and not checker.has_url_col(child.data): if verbose >= 1: print( colorama.Fore.YELLOW From b038c4f9cad8b098f833ec2381d6784144dd09dd Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:40:53 +0100 Subject: [PATCH 03/92] API: Disable ensure url option in search --- pangaea_downloader/pq_scraper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py 
b/pangaea_downloader/pq_scraper.py index dfbf485..7ea135a 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -75,7 +75,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, verbose=verbose - 1, ensure_url=False + ) if df_list is None: if verbose >= 1: print( @@ -110,7 +112,9 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): elif dataset_type == "paginated": df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": - df = datasets.fetch_child(url, verbose=verbose - 1) + df = datasets.fetch_child( + url, verbose=verbose - 1, ensure_url=False + ) except Exception as err: if isinstance(err, KeyboardInterrupt): raise From 09006e283455634771e33df063dc21779bb24078 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:53:55 +0100 Subject: [PATCH 04/92] MNT: Handle alt campaign name within set_metadata --- pangaea_downloader/tools/datasets.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index dbc841a..e9c5880 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -28,7 +28,6 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram time.sleep(t_wait) # Stay under 180 requests every 30s ds = PanDataSet(child_url) T_POLL_LAST = time.time() - doi = getattr(ds, "doi", "").split("doi.org/")[-1] # Dataset is restricted if ds.loginstatus != "unrestricted": if verbose >= 1: @@ -48,7 +47,7 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram ) return # Add metadata - df = set_metadata(ds, alt=doi) + df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) return df @@ -122,8 +121,7 @@ def fetch_children( ) else: # Add metadata - child_doi = getattr(child, "doi", "").split("doi.org/")[-1] - df = set_metadata(child, alt=child_doi) + df = set_metadata(child) # Add child dataset to list df = exclude_rows(df) df_list.append(df) @@ -136,11 +134,12 @@ def fetch_children( return None -def set_metadata(ds: PanDataSet, alt="unknown") -> DataFrame: +def set_metadata(ds: PanDataSet) -> DataFrame: """Add metadata to a PanDataSet's dataframe.""" ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign + alt = ds.data["doi"].split("doi.org/")[-1] if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = ds.events[0].campaign.name else: From ac76bc7b13d15c78b91af084b00e5655d3f9b9d7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 09:55:08 +0100 Subject: [PATCH 05/92] MNT: Use dataset ID as alt instead of DOI --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e9c5880..96ad334 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -139,7 +139,7 @@ def set_metadata(ds: PanDataSet) -> DataFrame: ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign - alt = ds.data["doi"].split("doi.org/")[-1] + alt = str(ds.id) if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = 
ds.events[0].campaign.name else: From f156f499dab0f4db040caabf707aa7f8821cea52 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 11:53:07 +0100 Subject: [PATCH 06/92] MNT: Skip children of parents without URL --- pangaea_downloader/pq_scraper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7ea135a..743cb70 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -75,9 +75,7 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children( - url, verbose=verbose - 1, ensure_url=False - ) + df_list = datasets.fetch_children(url, verbose=verbose - 1) if df_list is None: if verbose >= 1: print( From d8894f9845a4dcfb902824bd37dbc2039f93a2a3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 1 Sep 2022 14:17:26 +0100 Subject: [PATCH 07/92] STY: Split arg per line with ensure_url included --- pangaea_downloader/tools/datasets.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 96ad334..e0adc9e 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -54,7 +54,9 @@ def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFram def fetch_children( - parent_url: str, verbose=1, ensure_url=True + parent_url: str, + verbose=1, + ensure_url=True, ) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset From 13920281d0a651ff04142993fd6edbd24fadc510 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 2 Sep 2022 13:28:24 +0100 Subject: [PATCH 08/92] ENH: Get lat/lon from PanDataSet if not scraped --- pangaea_downloader/tools/scraper.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 95218f2..a6bb0bc 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -27,6 +27,30 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: soup = BeautifulSoup(resp.text, "lxml") # Get coordinates of expedition coordinates = get_metadata(soup) + if coordinates is None and hasattr(ds, "geometryextent"): + print( + colorama.Fore.RED + "\t\t\t[ALERT] Trying to get coordinates from" + " PanDataSet.geometryextent" + colorama.Fore.RESET + ) + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Longitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None or long is not None: + coordinates = lat, long + # Otherwise coordinates remains None and the error below is printed + + if coordinates is None: + print( + colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!" + " Saved file won't have Longitude, Latitude columns!"
+ colorama.Fore.RESET + ) # Get download link to photos page download_link = soup.find("div", attrs={"class": "text-block top-border"}).a["href"] @@ -71,10 +95,6 @@ def get_metadata(page_soup: BeautifulSoup) -> Optional[Tuple[float, float]]: lat = float(coordinates.find("span", attrs={"class": "latitude"}).text) long = float(coordinates.find("span", attrs={"class": "longitude"}).text) return lat, long - print( - colorama.Fore.RED + "\t\t\t[ERROR] Coordinate metadata not found on page!" - " Saved file won't have Longitude, Latitude columns!" + colorama.Fore.RESET - ) return None From f52e3b58497ba2143c2f115f6ef23a5a3a9551f8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 2 Sep 2022 13:29:58 +0100 Subject: [PATCH 09/92] ENH: Add auth_token support --- pangaea_downloader/pq_scraper.py | 25 ++++++++++++++++++++++--- pangaea_downloader/tools/datasets.py | 14 ++++++++++---- requirements.txt | 2 +- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 743cb70..48aa831 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -16,7 +16,12 @@ from pangaea_downloader.tools import datasets, process, scraper, search -def search_and_download(queries=None, output_dir="query-outputs", verbose=0): +def search_and_download( + queries=None, + output_dir="query-outputs", + auth_token=None, + verbose=0, +): """ Search `PANGAEA`_ for a set of queries, and download datasets for each result. @@ -31,6 +36,8 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): output_dir : str, default="query-outputs" The output directory where downloaded datasets will be saved. Any existing output datasets will be skipped instead of downloaded. + auth_token : str, optional + Bearer authentication token. verbose : int, default=1 Verbosity level. """ @@ -75,7 +82,11 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): # ------------- ASSESS DATASET TYPE ------------- # try: if is_parent: - df_list = datasets.fetch_children(url, verbose=verbose - 1) + df_list = datasets.fetch_children( + url, + verbose=verbose - 1, + auth_token=auth_token, + ) if df_list is None: if verbose >= 1: print( @@ -111,7 +122,10 @@ def search_and_download(queries=None, output_dir="query-outputs", verbose=0): df = scraper.scrape_image_data(url, verbose=verbose - 1) elif dataset_type == "tabular": df = datasets.fetch_child( - url, verbose=verbose - 1, ensure_url=False + url, + verbose=verbose - 1, + ensure_url=False, + auth_token=auth_token, ) except Exception as err: if isinstance(err, KeyboardInterrupt): @@ -195,6 +209,11 @@ def get_parser(): default="query-outputs", help="Directory for downloaded datasets. 
Default is %(default)s.", ) + parser.add_argument( + "--auth-token", + type=str, + help="Bearer authentication token", + ) parser.add_argument( "--verbose", "-v", diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e0adc9e..cb335b3 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,14 +19,19 @@ T_POLL_INTV = 0.1667 -def fetch_child(child_url: str, verbose=1, ensure_url=True) -> Optional[DataFrame]: +def fetch_child( + child_url: str, + verbose=1, + ensure_url=True, + auth_token=None, +) -> Optional[DataFrame]: """Fetch Pangaea child dataset using provided URI/DOI and return DataFrame.""" # Load data set global T_POLL_LAST global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url) + ds = PanDataSet(child_url, auth_token=auth_token) T_POLL_LAST = time.time() # Dataset is restricted if ds.loginstatus != "unrestricted": @@ -57,6 +62,7 @@ def fetch_children( parent_url: str, verbose=1, ensure_url=True, + auth_token=None, ) -> Optional[List[DataFrame]]: """Take in url of a parent dataset, fetch and return list of child datasets.""" # Fetch dataset @@ -64,7 +70,7 @@ def fetch_children( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url) + ds = PanDataSet(parent_url, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -104,7 +110,7 @@ def fetch_children( elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - child = PanDataSet(url) + child = PanDataSet(url, auth_token=auth_token) T_POLL_LAST = time.time() if ds.loginstatus != "unrestricted": if verbose >= 1: diff --git a/requirements.txt b/requirements.txt index 368ee16..8726b63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ matplotlib>=3.4.2 numpy>=1.20.3 opencv-python>=4.5.2.54 pandas>=1.2.5 -pangaeapy>=0.0.5 +pangaeapy>=1.0.6 requests>=2.25.1 scikit-learn>=0.24.2 tqdm From 4da4e30197d8f6c07bcfa811d9414f53501571aa Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 7 Mar 2023 12:25:58 -0500 Subject: [PATCH 10/92] MNT: Only print saving dataframe if verbosity high enough --- pangaea_downloader/tools/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index cb335b3..397b969 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -176,7 +176,8 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) -> return False # Save if dataframe not empty df.to_csv(output_path, index=False) - print(f"{tabs}[{idx}] Saved to '{output_path}'") + if verbose >= 1: + print(f"{tabs}[{idx}] Saved to '{output_path}'") return True From aec74c6304f9f080036c328feeb2ca06d435a758 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:08:46 -0500 Subject: [PATCH 11/92] BUG: Remove unused import of IPython, not specified in requirements --- pangaea_downloader/merge_benthic_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index eb15e1a..8376eb3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ 
b/pangaea_downloader/merge_benthic_datasets.py @@ -15,7 +15,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from IPython.display import display from tqdm.auto import tqdm from pangaea_downloader import __meta__ From 682fba462315032a8b39b11bfbc006be296d6275 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:09:15 -0500 Subject: [PATCH 12/92] BUG: Skip non-CSV files when processing outputs --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 8376eb3..6aee8f8 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -712,6 +712,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): dfs_fnames = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 + if not fname.endswith(".csv"): + continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] df = pd.read_csv(os.path.join(input_dirname, fname)) From 759b1be9f4ec07153f461ccabeaa70ca6961faf9 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 11:10:40 -0500 Subject: [PATCH 13/92] MNT: Don't add dummy campaign and site columns when downloading datasets --- pangaea_downloader/tools/datasets.py | 8 -------- pangaea_downloader/tools/scraper.py | 7 ------- 2 files changed, 15 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 397b969..23c7688 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -147,16 +147,8 @@ def set_metadata(ds: PanDataSet) -> DataFrame: ds.data["dataset_title"] = ds.title ds.data["doi"] = getattr(ds, "doi", "") # Dataset campaign - alt = str(ds.id) if (len(ds.events) > 0) and (ds.events[0].campaign is not None): ds.data["campaign"] = ds.events[0].campaign.name - else: - ds.data["campaign"] = alt - # Dataset site/event/deployment - if "Event" in ds.data.columns: - ds.data["site"] = ds.data["Event"] - else: - ds.data["site"] = alt + "_site" return ds.data diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index a6bb0bc..22f4e84 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -73,15 +73,8 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: df["Latitude"] = lat df["Dataset"] = ds.title df["DOI"] = getattr(ds, "doi", "") - doi = getattr(ds, "doi", "").split("doi.org/")[-1] if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name - else: - df["Campaign"] = doi - if "Event" in ds.data.columns: - df["Site"] = ds.data["Event"] - else: - df["Site"] = doi + "_site" return df From 18651b8a12b504ccb9b3b8b4fc43aa18e2c7b4bc Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:50:41 -0500 Subject: [PATCH 14/92] MNT: Print message explaining errors being repeated --- pangaea_downloader/pq_scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 48aa831..6f618b2 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -157,7 +157,9 @@ def search_and_download( print(f"Number of files previously saved: {n_files}.") print(f"Total dataset files: {n_files + n_downloads}") print(f"Number of dataset errors (excluding access): 
{len(errors)}.") - + if len(errors) > 0: + print() + print("Captured errors are now repeated as follows.") for msg in errors: print() print(msg) From 53324810d17eb8f722b70ef287146131277912d3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:51:21 -0500 Subject: [PATCH 15/92] MNT: Ignore existing dataset and site columns --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6aee8f8..b0baa64 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -255,8 +255,8 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. desired_columns = { - "dataset": ["ds_id", "dataset", "Campaign", "campaign"], - "site": ["Event", "event", "Site", "site", "deployment"], + "dataset": ["ds_id"], + "site": ["Event", "event", "deployment"], "image": ["image", "filename"], "datetime": [ "Date/Time", From df49254d14df4122d3d4907954e342fa4fcd8749 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:51:50 -0500 Subject: [PATCH 16/92] MNT: Change default site to be based on dataset name, not DOI --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index b0baa64..bd2f688 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -365,6 +365,10 @@ def reformat_df(df, remove_duplicate_columns=True): # if "timestamp" not in df.columns and "datetime" in df.columns: # df["timestamp"] = df["datetime"].apply(datetime2timestamp) + # Add default site if it is missing + if "site" not in df.columns: + df["site"] = df["dataset"] + "_site" + if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( From a5786557c5f9297f75e6b1c310246e9adf16b0eb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 13:52:36 -0500 Subject: [PATCH 17/92] BUG: Reflect latitudesouth, latitude-, longitudewest, longitude- --- pangaea_downloader/merge_benthic_datasets.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index bd2f688..987e485 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -273,7 +273,7 @@ def reformat_df(df, remove_duplicate_columns=True): "latitude+", "latitudemed", "latitudenorth", - "latitudesouth", + # "latitudesouth", # special handling ], "longitude": [ "Longitude", @@ -282,8 +282,8 @@ def reformat_df(df, remove_duplicate_columns=True): "long", "longitude+", "longitudemed", - "longitudewest", "longitudeeast", + # "longitudewest", # special handling ], "x_pos": [], "y_pos": [], @@ -360,6 +360,16 @@ def reformat_df(df, remove_duplicate_columns=True): # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") + # Handle latitudesouth and longitudewest + if "latitude" not in df.columns and "latitudesouth" in df.columns: + df["latitude"] = -df["latitudesouth"] + if "latitude" not in df.columns and "latitude-" in df.columns: + df["latitude"] = -df["latitude-"] + if 
"longitude" not in df.columns and "longitudewest" in df.columns: + df["longitude"] = -df["longitudewest"] + if "longitude" not in df.columns and "longitude-" in df.columns: + df["longitude"] = -df["longitude-"] + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From ea5eba4e596766dc10a5c57a27a2bf07ed43dd04 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 Mar 2023 19:18:41 -0500 Subject: [PATCH 18/92] MNT: Save results for parents whose children don't have URLs --- pangaea_downloader/pq_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 6f618b2..fac599b 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -85,6 +85,7 @@ def search_and_download( df_list = datasets.fetch_children( url, verbose=verbose - 1, + ensure_url=False, auth_token=auth_token, ) if df_list is None: From 8203880f99b9fe38938ad53f682c14ae5661c446 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Fri, 24 Mar 2023 17:29:21 +0000 Subject: [PATCH 19/92] MNT: Inherit verbosity from caller --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 987e485..f8b7f98 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -767,7 +767,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_with_repeat_urls.append(fname) # Try to fix repeated URLs that are accidental dups but should differ - df = fixup_repeated_urls(df, url_column=url_col, verbose=1) + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) if len(df) != len(df.drop_duplicates(subset=url_col)): files_with_repeat_urls2.append(fname) @@ -832,7 +832,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): else: if verbose >= 1: print("Fix repeated output paths to prevent collisions") - df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=2) + df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) if os.path.dirname(output_path): os.makedirs(os.path.dirname(output_path), exist_ok=True) From 77d4cf1619d5adf71672365b79732e6a9f6cd898 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:21:40 +0100 Subject: [PATCH 20/92] MNT: Read CSV files without low_memory mode due to 'mixed types' --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f8b7f98..f4a79fe 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -730,7 +730,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - df = pd.read_csv(os.path.join(input_dirname, fname)) + df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): continue From db0fae0361ed049e39f05294377c5a012385cecc Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:25:23 +0100 Subject: [PATCH 21/92] ENH: Interpolate or extract missing lat, lon, datetime metadata --- pangaea_downloader/merge_benthic_datasets.py | 707 ++++++++++++++++++- 
requirements.txt | 1 + 2 files changed, 707 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f4a79fe..b6ffb72 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -10,20 +10,24 @@ import os import re from collections import defaultdict +from functools import partial import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy.interpolate +from pangaeapy import PanDataSet from tqdm.auto import tqdm from pangaea_downloader import __meta__ from pangaea_downloader.tools import checker try: - from benthicnet.io import fixup_repeated_output_paths + from benthicnet.io import fixup_repeated_output_paths, row2basename except ImportError: fixup_repeated_output_paths = None + row2basename = None TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], @@ -695,6 +699,698 @@ def fixup_repeated_urls( return df +def fixup_favourite_images(df, verbose=1): + """ + Drop duplicated favourite images. + + These occur in Ingo Schewe's datasets along OFOS profiles during POLARSTERN + cruises, PANGAEA dataset ids 849814--849816 and 873995--874002. + + Parameters + ---------- + df : pandas.DataFrame + A PANGAEA dataframe with Type column. + verbose : int, default=1 + Level of verbosity. + + Returns + ------- + df : pandas.DataFrame + As input dataframe, but with all Type entries starting with favourite + removed (case-insensitive). + """ + if "Type" not in df.columns: + return df + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occurring immediately + # after them in the dataframe. + n_samples_before = len(df) + df = df[~df["Type"].str.lower().str.startswith("favourite")] + n_samples_after = len(df) + if verbose >= 1 and n_samples_after != n_samples_before: + print( + f"{df.iloc[0]['dataset']}:" + f" Removed {n_samples_before - n_samples_after} favourited images." + f" {n_samples_before} -> {n_samples_after} rows" + ) + return df + + +def get_dataset_datetime(ds_id): + """ + Determine a generic date for a dataset from the min and max extent datetimes. + + Parameters + ---------- + ds_id : int + The identifier of a PANGAEA dataset. + + Returns + ------- + dt_avg : str + The average datetime between the min and max extent, with precision + reduced to reflect what can accurately be represented. + """ + ds = PanDataSet(ds_id) + dt_min = pd.to_datetime(ds.mintimeextent) + dt_max = pd.to_datetime(ds.maxtimeextent) + if dt_min is None and dt_max is None: + return pd.NaT + elif dt_min is None: + return dt_max.strftime("%Y-%m-%d") + elif dt_max is None: + return dt_min.strftime("%Y-%m-%d") + delta = dt_max - dt_min + dt_avg = dt_min + delta / 2 + if delta > datetime.timedelta(days=90): + return dt_avg.strftime("%Y") + if delta > datetime.timedelta(days=4): + return dt_avg.strftime("%Y-%m") + if delta > datetime.timedelta(hours=3): + return dt_avg.strftime("%Y-%m-%d") + if delta > datetime.timedelta(minutes=5): + return dt_avg.strftime("%Y-%m-%d %H:00:00") + if delta > datetime.timedelta(seconds=5): + return dt_avg.strftime("%Y-%m-%d %H:%M:00") + return dt_avg.strftime("%Y-%m-%d %H:%M:%S") + + +def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): + """ + Extract datetime information from the contents of the image column in the dataframe.
+ + Note that the extraction operation is only performed on dataset IDs for + which the image naming scheme has been manually evaluated, and is not + applied blindly to datasets which have not been inspected. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int + The identifier of the PANGAEA dataset. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells filled in from the image. + Existing datetime values are unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + ds_id = int(ds_id) + + select = df["datetime"].isna() + + if row2basename is None: + selected_image = df.loc[select, "image"] + else: + selected_image = df[select].apply( + partial(row2basename, use_url_extension=True), axis=1 + ) + + selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) + + if ds_id in [ + 785104, + 785105, + 785108, + 785109, + 785110, + 836457, + 867771, + 867772, + 867773, + 867774, + 867775, + 867776, + 867777, + 867778, + 867806, + 867807, + 867808, + 867852, + 867853, + 867861, + 873541, + 875713, + 875714, + 876422, + 876423, + 876511, + 876512, + 876513, + 876514, + 876515, + 876516, + 876517, + 876518, + 880043, + 880044, + 885666, + 885667, + 885668, + 885669, + 885670, + 885672, + 885674, + 885675, + 885709, + 885712, + 885713, + 885714, + 885715, + 885716, + 885717, + 885718, + 885719, + 885720, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. PP_107-100_2012-03-19.png + # e.g. PP_100_2012-06-05a.jpg + # e.g. TH_122_2012-03-27.jpg + # e.g. J_05_2017_05_24a.jpg + # e.g. J_overview_2017-05-24za.jpg + # e.g. J_40_2017_08_11a.jpg + # e.g. J_05_2017-08-11a.jpg + # e.g. LG_OVERVIEW_01_05_06_07_09_2013_02_24a.jpg + # e.g. LG_01_07_2010_11_11a.jpg + # e.g. LG_01_2010_11_11a.jpg + # e.g. LG_Cluster1_2012_01_31a.jpg + # e.g. LG_01_07_2012_04_22a.jpg + # e.g. LG_SCREW_2012_04_22a.jpg + # e.g. So_01_2014_02_15b.jpg + # e.g. XH_01_2013_01_12_a.jpg + # e.g. XH_01%2B09_2013_11_19_a.jpg + # e.g. XH_01_2010_04_22_a.jpg + # e.g. LH_020_2015_01_28a_counted.jpg + # e.g. LH_020_2015_01_28xx.jpg + # e.g. J_J40%2BJ46%2BJ41_2016_09_25_a.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-11:].str.replace("_", "-").str.lstrip("-") + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [ + 789211, + 789212, + 789213, + 789214, + 789215, + 789216, + 789219, + 819234, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2003_W01-2.jpg + # e.g. 2004_B_bewachsen.jpg + # e.g. 2005_B.jpg + # e.g. 2013_B01-1.jpg + dtstr = selected_image_no_ext.str[:4] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [ + 789217, + 793210, + 793211, + 818906, + 818907, + 836263, + 836264, + 836265, + 836266, + 837653, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 04_2011.jpg + # e.g. 04a_2011_analog.jpg + # e.g. 04.2-2008.jpg + # e.g. 08-2008.jpg + # e.g. 04a_2013.jpg + # e.g. 05a_2003.jpg + # e.g. 
04_2007.jpg + dtstr = selected_image_no_ext.str.lower().str.rstrip( + "abcdefghijklmnopqrstuvwxyz_-" + ) + dtstr = dtstr.str[-4:] + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [836024, 836025]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 00setting_2014-08.jpg + # e.g. 39.9_2014.jpg + # e.g. 2014_B01-1.jpg + df.loc[select, "datetime"] = "2014" + + elif ds_id in [840699, 840700, 840702, 840703, 840742, 840743]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_001_2012-01-31.jpg + # e.g. J_003_2012-01-31_2.jpg + # e.g. J_115_2012-01-31_a.jpg + # e.g. J_033_2012-08-08.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + dtstr = dtstr.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [840701, 849298]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_002_2013-03_03a.jpg + # e.g. J_001_2015-01.jpg + # e.g. J_001_2015-01_a.jpg + # e.g. J_056_2013-03_06logger.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + # Test the format is correct; we will get an error if not + _ = pd.to_datetime(dtstr, format="%Y-%m") + # But we actually want to keep the lower precision string + df.loc[select, "datetime"] = dtstr + + elif ds_id in [872407, 872408, 872409, 872410, 872411]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. J_40_2017-01-12_a.jpg + # e.g. J_overview2_2017-02-02_x.jpg + # e.g. J_xx_2017-01-12_x-62.jpg + # e.g. J_17_2017-01-14.jpg + # e.g. J_23_2017-01-14_b-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: x.split("_")[2]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + elif ds_id in [878045, 888410]: + # Nothing to do + pass + + elif ds_id in [894734]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. HOTKEY_2018_03_27at21_09_21CP4A4682 + # e.g. TIMER_2018_03_18at04_04_09CP4A3970 + dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") + + elif ds_id in [896157]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. 2016-08-2600000.jpg + dtstr = selected_image_no_ext.str[:10] + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + if ds_id in [ + 918232, + 918233, + 918327, + 918340, + 918341, + 918382, + 918383, + 918385, + ]: + if verbose >= 1: + print( + f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" + ) + # e.g. XH_01_2010_04_22_a.jpg + # e.g. XH_01_2010_04_28a.jpg + # e.g. XH_03_2018_10_18_a-1.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:5])[:10]) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%d") + + return df + + +def add_missing_datetime(df, ds_id=None, verbose=1): + """ + Add missing datetime values using either the mean extent or extraction from the file name. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. 
The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. + + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime cells completed, either by using the + average from the datetime extent metadata, or by extracting it from the + image name. + All existing datetime values are left unchanged. + """ + if "datetime" not in df.columns: + df["datetime"] = pd.NaT + + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + # Add datetimes that are still missing by inferring from the image filename + df = fix_missing_datetime_from_image_name(df, ds_id, verbose=verbose) + + if all(df["datetime"].isna()): + # This dataset has no datetime values + # Try to determine average datetime from the datetime extent metadata on + # the dataset record + dt_avg = get_dataset_datetime(ds_id) + if dt_avg is not None: + if verbose >= 1: + print( + f"{ds_id}: Using average datetime from extent" + f" - filenames look like {df.iloc[0]['image']}" + ) + df["datetime"] = dt_avg + + if not any(df["datetime"].isna()): + # This dataframe already has all datetime information + return df + + select = df["datetime"].isna() + if ds_id in [889035, 889025]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the abstract on PANGAEA (sic): + # Experimet was setup during 2007-02-15 and 2007-06-13. + df.loc[select, "datetime"] = "2007" + + if ds_id in [896160, 896164]: + if verbose >= 1: + print(f"{ds_id}: Adding manual missing datetime for {ds_id}") + # From the INDEX 2016 ROV (see dataset title and paper + # https://doi.org/10.3389/fmars.2019.00096) + df.loc[select, "datetime"] = "2016" + + return df + + +def interpolate_by_datetime(df, columns): + """ + Use datetime column to interpolate values for selected columns. + + Parameters + ---------- + df : pandas.DataFrame + Dataframe with ``"datetime"`` column, which may contain missing values + in other columns. + columns : str or iterable of str + Name of column or columns to fill in missing values with interpolation. + + Returns + ------- + df : pandas.DataFrame + Like input, but with missing values in specified columns completed by + linear interpolation over datetime. + """ + # Convert datetime string to a datetime object + datetime_actual = pd.to_datetime(df["datetime"]) + has_datetime = ~datetime_actual.isna() + if isinstance(columns, str): + columns = [columns] + for col in columns: + has_col = ~df[col].isna() + has_dt_and_col = has_datetime & has_col + has_dt_not_col = has_datetime & ~has_col + df.loc[has_dt_not_col, col] = np.interp( + datetime_actual[has_dt_not_col], + datetime_actual[has_dt_and_col], + df.loc[has_dt_and_col, col], + ) + return df + + +def fixup_incomplete_metadata(df, ds_id=None, verbose=1): + """ + Fix datasets which have partial, but incomplete, lat/lon/datetime metadata. + + Interpolation is performed as appropriate to the dataset. The methodology + was determined by manually inspecting each dataset. + Any latitude and longitude values which can not be resolved are filled in + with the dataset-level mean latitude and longitude as reported by PANGAEA. + + Parameters + ---------- + df : pandas.DataFrame + Input dataframe. + ds_id : int, optional + The identifier of the PANGAEA dataset. The default behaviour is to + extract this from the dataset column of the dataframe. + verbose : int, default=1 + Verbosity level. 
+ + Returns + ------- + df : pandas.DataFrame + As input, but with missing datetime, latitude, longitude, and/or depth + cells completed by interpolation or similar. + All existing datetime values are left unchanged. + """ + if ds_id is None: + # Get dataset id from first row + ds_id = df.iloc[0]["dataset"].split("-")[-1] + ds_id = int(ds_id) + + if ds_id in [753197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + + if ds_id in [805606, 805607, 805611, 805612]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print(f"{ds_id}: Interpolating by index") + indices = np.arange(len(df)) + col = "datetime" + select_not_col = df[col].isna() + select_has_col = ~select_not_col + if any(select_has_col) and any(select_not_col): + missing_timestamps = np.interp( + indices[select_not_col], + indices[select_has_col], + pd.to_datetime(df.loc[select_has_col, "datetime"]).apply( + lambda x: x.timestamp() + ), + ) + df.loc[select_not_col, col] = [ + datetime.datetime.fromtimestamp(int(ts)) for ts in missing_timestamps + ] + + if ds_id == 875080: + # N.B. There is date metadata in the csv, but not time. But there is time + # metadata in the filename, so we could extract this if we wanted to. + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print("Nothing to be done.") + # lat/lon was only recorded for the first 11 images. Fill in the rest + # with the median latitude and longitude for the record at the end + # of this function. + + if 873995 <= ds_id <= 874002: + if verbose >= 1: + print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") + # Interpolate lat, lon, and depth based on datetime + df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + + if ds_id in [875071, 875073]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Drop rows without datetime values (these have missing lat/lon as well) + # For 875071, these images are of the deck of the ship. + # For 875073, these images have a translation of less than half an image + # from the subsequent image, so we don't need the ones without metadata. + df = df[~df["datetime"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth"]) + + if ds_id in [875084]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # For 875084, images without latitude and longitude are not useful. + # The first three are of the deck, the rest are dark watercolumn shots. + df = df[~df["longitude"].isna()] + # Interpolate missing depth values + df = interpolate_by_datetime(df, ["depth"]) + + if (878001 <= ds_id <= 878019) or ds_id == 878045: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # Images without metadata are of the water column and highly redundant. + df = df[~df["longitude"].isna()] + + if ds_id in [894732, 894734]: + if verbose >= 1: + print(f"{ds_id}: Dropping rows missing metadata for dataset {ds_id}") + # It's not clear to me that any of these images are of the seafloor. 
+ df = df[~df["longitude"].isna()] + + if ds_id in [895557, 903782, 903788, 903850, 907025, 894801]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + print( + f"{ds_id}: Interpolating by index over subset of images in the same series" + ) + indices = np.arange(len(df)) + image_no_ext = df["image"].apply(lambda x: os.path.splitext(x)[0]) + image_major = image_no_ext.str[:-3] + missing_dt = df["datetime"].isna() + missing_lat = df["latitude"].isna() + missing_lon = df["longitude"].isna() + for image_major_i in image_major.unique(): + select = image_major == image_major_i + col = "latitude" + select_and_col = select & ~missing_lat + select_not_col = select & missing_lat + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "longitude" + select_and_col = select & ~missing_lon + select_not_col = select & missing_lon + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = np.interp( + indices[select_not_col], + indices[select_and_col], + df.loc[select_and_col, col], + ) + col = "datetime" + select_and_col = select & ~missing_dt + select_not_col = select & missing_dt + if any(select_and_col) and any(select_not_col): + df.loc[select_not_col, col] = scipy.interpolate.interp1d( + indices[select_and_col], + pd.to_datetime(df.loc[select_and_col, col]), + kind="nearest", + fill_value="extrapolate", + )(indices[select_not_col]) + + if ds_id in [911904, 918924, 919348]: + if verbose >= 1: + print(f"{ds_id}: Extracting missing datetime metadata for dataset {ds_id}") + # Extract missing datetime from the filename, formatted like (e.g.) + # TIMER_2019_03_31_at_05_50_12_IMG_0263 + has_no_datetime = df["datetime"].isna() + fname_inner = df.loc[has_no_datetime, "image"].apply( + lambda x: "_".join(x.split("_")[1:-2]) + ) + df.loc[has_no_datetime, "datetime"] = pd.to_datetime( + fname_inner, format="%Y_%m_%d_at_%H_%M_%S" + ) + if verbose >= 1: + print( + f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" + ) + df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + + if ds_id in [914155]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Images without datetime are too dark + df = df[~df["datetime"].isna()] + # Other images are missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914156, 914197]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. + for from_image, to_image in [ + ("IMG_0393", "IMG_0392"), + ("IMG_0395", "IMG_0394"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if ds_id in [914192]: + if verbose >= 1: + print(f"{ds_id}: Fixing missing metadata for dataset {ds_id}") + # Some images are clearly of the same thing, but one is good visibility + # with no lat/lon, and the next is too dark and has no datetime. 
+ for from_image, to_image in [ + ("IMG_1776", "IMG_1775"), + ]: + columns = ["latitude", "longitude"] + select_from = df["image"].str.startswith(from_image) + select_to = df["image"].str.startswith(to_image) + df.loc[select_to, columns] = df.loc[select_from, columns] + # Drop images without datetime + df = df[~df["datetime"].isna()] + # Fill in any missing latitude and longitude metadata + df = interpolate_by_datetime(df, ["latitude", "longitude"]) + + if any(df["latitude"].isna() | df["longitude"].isna()): + # Fill in any missing latitude and longitude values with the + # mean coordinate reported at the dataset level + ds = PanDataSet(ds_id) + if hasattr(ds, "geometryextent"): + lat = None + long = None + for k in ["meanLatitude", "latitude", "Latitude"]: + if k in ds.geometryextent: + lat = ds.geometryextent[k] + break + for k in ["meanLongitude", "longitude", "Longitude"]: + if k in ds.geometryextent: + long = ds.geometryextent[k] + break + if lat is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean latitude for missing values") + df.loc[df["latitude"].isna(), "latitude"] = lat + if long is not None: + if verbose >= 1: + print(f"{ds_id}: Using dataset mean longitude for missing values") + df.loc[df["longitude"].isna(), "longitude"] = long + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. @@ -776,6 +1472,15 @@ def process_datasets(input_dirname, output_path=None, verbose=0): if sum(df.isna().all("columns")) > 0: print(f"{ds_id} has a row which is all NaNs") + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + dfs.append(df) dfs_fnames.append(fname) diff --git a/requirements.txt b/requirements.txt index 8726b63..05b0ea0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ pandas>=1.2.5 pangaeapy>=1.0.6 requests>=2.25.1 scikit-learn>=0.24.2 +scipy tqdm From 2c16f24788dc2c541e078a6b05a9e66812f18c82 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:29:21 +0100 Subject: [PATCH 22/92] MNT: Increase default verbosity level 0->1 --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index b6ffb72..bae82d6 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1595,7 +1595,7 @@ def get_parser(): "--verbose", "-v", action="count", - default=0, + default=1, help=textwrap.dedent( """ Increase the level of verbosity of the program.
This can be From 073efff5515e7db43499ae33ca0412c42674687a Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:29:42 +0100 Subject: [PATCH 23/92] MNT: Exclude some more dataset titles --- pangaea_downloader/merge_benthic_datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index bae82d6..f8db667 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -199,6 +199,13 @@ def check_title(title): return False if title.startswith("Images of shell cross sections"): return False + if ( + "early biofouling processes in a coastal lagoon" in title.lower() + or "early biofouling processes in a coastal la goon" in title.lower() + ): + return False + if "photographs of tiles" in title.lower(): + return False return True From 0a0b632874b3f39b5aac3881d0e9d4684bffa6d7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:30:13 +0100 Subject: [PATCH 24/92] MNT: Manually exclude dataset 805690, which was downloaded without its title? --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f8db667..aceae03 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1433,6 +1433,10 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] + if ds_id == "805690": + # The title was not captured from this dataset for some reason, + # so we can't exclude it via the title. + continue df = pd.read_csv(os.path.join(input_dirname, fname), low_memory=False) n_total += 1 if not checker.has_url_col(df): From 2bbbd66816d79d11c68c6d5db2c9ca8a480b3cb2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:31:04 +0100 Subject: [PATCH 25/92] MNT: Make final report only appear if verbosity enabled --- pangaea_downloader/merge_benthic_datasets.py | 35 ++++++++++---------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index aceae03..89979b0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1495,23 +1495,24 @@ def process_datasets(input_dirname, output_path=None, verbose=0): dfs.append(df) dfs_fnames.append(fname) - print(f"There are {n_valid} valid (of {n_total}) valid datasets") - print( - f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" - ) - print( - f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" - ) - print() - print(f"There are {len(column_count)} unique column names:") - print() - - for col, count in dict( - sorted(column_count.items(), key=lambda item: item[1], reverse=True) - ).items(): - c = col + " " - print(f"{c:.<35s} {count:4d}") - print() + if verbose >= 0: + print(f"There are {n_valid} valid (of {n_total}) valid datasets") + print( + f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" + ) + print( + f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" + ) + print() + print(f"There are {len(column_count)} unique column names:") + print() + + for col, count in dict( + 
sorted(column_count.items(), key=lambda item: item[1], reverse=True) + ).items(): + c = col + " " + print(f"{c:.<35s} {count:4d}") + print() if verbose >= 1: print("Filter columns") From 37bedd6021fb5b3c9ce83b6fcb4dc49362d991c5 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:31:33 +0100 Subject: [PATCH 26/92] MNT: Remove unused import of dateutil.parser --- pangaea_downloader/merge_benthic_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 89979b0..ffab652 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -12,7 +12,6 @@ from collections import defaultdict from functools import partial -import dateutil.parser import matplotlib.pyplot as plt import numpy as np import pandas as pd From f7a44eef4b9e8f26293c9efb23d74a4aa1155bce Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:45:00 +0100 Subject: [PATCH 27/92] ENH: Use caching functionality built into PanDataSet --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- pangaea_downloader/tools/datasets.py | 6 +++--- pangaea_downloader/tools/scraper.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index ffab652..25ee80d 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -758,7 +758,7 @@ def get_dataset_datetime(ds_id): The average datetime between the min and max extent, with precision reduced to reflect what can accurately be represented. """ - ds = PanDataSet(ds_id) + ds = PanDataSet(ds_id, enable_cache=True) dt_min = pd.to_datetime(ds.mintimeextent) dt_max = pd.to_datetime(ds.maxtimeextent) if dt_min is None and dt_max is None: @@ -1373,7 +1373,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the # mean coordinate reported at the dataset level - ds = PanDataSet(ds_id) + ds = PanDataSet(ds_id, enable_cache=True) if hasattr(ds, "geometryextent"): lat = None long = None diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 23c7688..8cc319a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -31,7 +31,7 @@ def fetch_child( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(child_url, auth_token=auth_token) + ds = PanDataSet(child_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Dataset is restricted if ds.loginstatus != "unrestricted": @@ -70,7 +70,7 @@ def fetch_children( global T_POLL_INTV t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(parent_url, auth_token=auth_token) + ds = PanDataSet(parent_url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() # Check restriction if ds.loginstatus != "unrestricted": @@ -110,7 +110,7 @@ def fetch_children( elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - child = PanDataSet(url, auth_token=auth_token) + child = PanDataSet(url, enable_cache=True, auth_token=auth_token) T_POLL_LAST = time.time() if ds.loginstatus != 
"unrestricted": if verbose >= 1: diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 22f4e84..a8fa7e5 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -17,7 +17,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: # Load dataset t_wait = max(0, datasets.T_POLL_LAST + datasets.T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s - ds = PanDataSet(url) + ds = PanDataSet(url, enable_cache=True) datasets.T_POLL_LAST = time.time() # Request dataset url if verbose >= 1: From 1643a8bef872ade381c9b74d6f37ff521e167b26 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:00 +0100 Subject: [PATCH 28/92] MNT: Extract datetime from filename for rest of 896160 series --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 25ee80d..5d1c1e0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1027,7 +1027,7 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): dtstr = selected_image_no_ext.apply(lambda x: "_".join(x.split("_")[1:])[:20]) df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y_%m_%dat%H_%M_%S") - elif ds_id in [896157]: + elif ds_id in [896157, 896160, 896164]: if verbose >= 1: print( f"{ds_id}: Extracting missing datetime from filename for dataset {ds_id}" From 5e4cf23ed0aae38fed1a73086da63cc38ea6bcab Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:16 +0100 Subject: [PATCH 29/92] MNT: Extract from filename from two more datasets --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 5d1c1e0..6aa8770 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -824,6 +824,8 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): if ds_id in [ 785104, 785105, + 785106, + 785107, 785108, 785109, 785110, From 2f0d49c82988e146627465b4811f86132b04274b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 00:59:47 +0100 Subject: [PATCH 30/92] MNT: Also manually exclude 803979, parent of 805690 --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6aa8770..11e1350 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1434,7 +1434,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): continue # for fname in tqdm(os.listdir(input_dirname)): ds_id = os.path.splitext(fname)[0] - if ds_id == "805690": + if ds_id in ["805690", "803979"]: # The title was not captured from this dataset for some reason, # so we can't exclude it via the title. 
continue From f33f564b69ae05f792fd9eff3f40f9464109c4f2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:05:32 +0100 Subject: [PATCH 31/92] ENH: Extract datetime from filename for datasets 371062, 371063, 371064 --- pangaea_downloader/merge_benthic_datasets.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 11e1350..6e90b99 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -821,7 +821,14 @@ def fix_missing_datetime_from_image_name(df, ds_id, verbose=1): selected_image_no_ext = selected_image.apply(lambda x: os.path.splitext(x)[0]) - if ds_id in [ + if 371062 <= ds_id <= 371064: + # e.g. PO309_41-1_2004-04-05T08_55_41.jpg + # e.g. PO309_41-2-1_2004-04-05T11_28_26.jpg + # e.g. PO322_211-4-1_2005-05-18T19_35_31.jpg + dtstr = selected_image_no_ext.apply(lambda x: "-".join(x.split("_")[2:])) + df.loc[select, "datetime"] = pd.to_datetime(dtstr, format="%Y-%m-%dT%H-%M-%S") + + elif ds_id in [ 785104, 785105, 785106, From 6929f5ff51dd862a7215ecc662d48c1f62cb9453 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:22:32 +0100 Subject: [PATCH 32/92] MNT: C416 Unnecessary dict comprehension - rewrite using dict() --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index a8fa7e5..34a4b52 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -124,7 +124,7 @@ def get_pagination(page_soup: BeautifulSoup, src_url: str) -> Optional[dict]: # List of page URLs page_urls = [urljoin(src_url, a["href"]) for a in pagination.find_all("a")][:-1] # Page number : Page URL - page_dict = {k: v for k, v in zip(page_nums, page_urls)} + page_dict = dict(zip(page_nums, page_urls)) return page_dict From 4e1309c56ec7f190311283fc2d454eb2e42a1490 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:22:51 +0100 Subject: [PATCH 33/92] Revert "MNT: Save results for parents whose children don't have URLs" This reverts commit ea5eba4e596766dc10a5c57a27a2bf07ed43dd04. We don't need to save these superfluous results now we have caching enabled. 
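The caching relied on here works together with the module-level rate limiter in tools/datasets.py. A minimal sketch of the combined pattern, assuming the PanDataSet signature used in the diffs above; the helper name fetch_pan_dataset is illustrative and not part of the codebase:

    import time

    from pangaeapy import PanDataSet

    T_POLL_LAST = 0
    T_POLL_INTV = 0.1667  # stay under 180 requests every 30 s

    def fetch_pan_dataset(url_or_id, **kwargs):
        """Throttled, cached PanDataSet load (illustrative helper)."""
        global T_POLL_LAST
        # Sleep just long enough to respect the self-imposed request interval
        t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time())
        time.sleep(t_wait)
        # enable_cache=True lets repeat runs reuse records already fetched,
        # which is why the extra saved results reverted here became redundant
        ds = PanDataSet(url_or_id, enable_cache=True, **kwargs)
        T_POLL_LAST = time.time()
        return ds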
--- pangaea_downloader/pq_scraper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index fac599b..198e48d 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -85,7 +85,6 @@ def search_and_download( df_list = datasets.fetch_children( url, verbose=verbose - 1, - ensure_url=False, auth_token=auth_token, ) if df_list is None: @@ -125,7 +124,6 @@ def search_and_download( df = datasets.fetch_child( url, verbose=verbose - 1, - ensure_url=False, auth_token=auth_token, ) except Exception as err: From a7e152148f62eb50da10689ecd6ecb479a4e93bb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 01:40:24 +0100 Subject: [PATCH 34/92] DOC: Typo Scrapping -> Scraping --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 8cc319a..a3c4ded 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -103,7 +103,7 @@ def fetch_children( continue elif typ == "paginated": if verbose >= 1: - print(f"\t\t[{i+1}] Scrapping dataset...") + print(f"\t\t[{i+1}] Scraping dataset...") df = scraper.scrape_image_data(url) if df is not None: df_list.append(df) From 322b20cd5d820b1380234057270c78a22bf3c357 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:07:12 +0100 Subject: [PATCH 35/92] MNT: Check other children even if one is a restricted tabular dataset --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index a3c4ded..9c3c14b 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -119,7 +119,7 @@ def fetch_children( + f"\t\t[{i+1}] [ERROR] Access restricted: '{ds.loginstatus}'. {url}" + colorama.Fore.RESET ) - return + continue if ensure_url and not checker.has_url_col(child.data): if verbose >= 1: print( From f3f8d274dc94c5f0bf708deaa17d32809011fb0b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:07:51 +0100 Subject: [PATCH 36/92] RF: Better loop conditioning structure, with common code at the end --- pangaea_downloader/tools/datasets.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 9c3c14b..bb8a682 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -105,8 +105,6 @@ def fetch_children( if verbose >= 1: print(f"\t\t[{i+1}] Scraping dataset...") df = scraper.scrape_image_data(url) - if df is not None: - df_list.append(df) elif typ == "tabular": t_wait = max(0, T_POLL_LAST + T_POLL_INTV - time.time()) time.sleep(t_wait) # Stay under 180 requests every 30s @@ -127,12 +125,13 @@ def fetch_children( + f"\t\t[{i+1}] [WARNING] Image URL column NOT found! {url} Skipping..." 
+ colorama.Fore.RESET ) - else: - # Add metadata - df = set_metadata(child) - # Add child dataset to list - df = exclude_rows(df) - df_list.append(df) + continue + # Add metadata + df = set_metadata(child) + # Add child dataset to list + df = exclude_rows(df) + if df is not None: + df_list.append(df) # Return result if len(df_list) > 0: From 24583c9c10473f834cd59e725eaf651781c8638e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:23:36 +0100 Subject: [PATCH 37/92] MNT: Save title as dataset_title, not Dataset column --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 34a4b52..7df2ec1 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -71,7 +71,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: lat, long = coordinates df["Longitude"] = long df["Latitude"] = lat - df["Dataset"] = ds.title + df["dataset_title"] = ds.title df["DOI"] = getattr(ds, "doi", "") if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name From a752bdf92b8c8eecbc0100e35215e030b3bacbcf Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:29:04 +0100 Subject: [PATCH 38/92] RF: Move auto-deleting of partial file into save_df utility --- pangaea_downloader/pq_scraper.py | 11 +---------- pangaea_downloader/tools/datasets.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 198e48d..7165ba7 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -139,16 +139,7 @@ def search_and_download( # ----------------- SAVE TO FILE ----------------- # if df is None: continue - try: - saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) - except Exception as err: - # Delete partially saved file, if present - if os.path.isfile(output_path): - try: - os.remove(output_path) - except Exception: - pass - raise err + saved = datasets.save_df(df, output_path, level=1, verbose=verbose - 1) n_downloads += 1 if saved else 0 if verbose >= 0: diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index bb8a682..2b24592 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -6,6 +6,8 @@ use pangaea_downloader.tools.scraper module. """ import os +import shutil +import tempfile import time from typing import List, Optional @@ -165,8 +167,14 @@ def save_df(df: DataFrame, output_path: str, level=1, index=None, verbose=1) -> if verbose >= 1: print(f"{tabs}[{idx}] Empty DataFrame! 
File not saved!") return False - # Save if dataframe not empty - df.to_csv(output_path, index=False) + # Save dataframe if it is not empty + with tempfile.TemporaryDirectory() as dir_tmp: + # Write to a temporary file + tmp_path = os.path.join(dir_tmp, os.path.basename(output_path)) + df.to_csv(tmp_path, index=False) + # Move our temporary file to the destination + os.makedirs(os.path.dirname(output_path), exist_ok=True) + shutil.move(tmp_path, output_path) if verbose >= 1: print(f"{tabs}[{idx}] Saved to '{output_path}'") return True From 03e381f93dc0d8576a0da490515ba7dff12033ab Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:30:23 +0100 Subject: [PATCH 39/92] ENH: Record ds_id while acquiring each dataset --- pangaea_downloader/tools/datasets.py | 18 ++++++++++++++++-- pangaea_downloader/tools/scraper.py | 5 ++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 2b24592..c8a591b 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -57,6 +57,10 @@ def fetch_child( df = set_metadata(ds) # Exclude unwanted rows df = exclude_rows(df) + # Add dataset ID + doi = getattr(ds, "doi", "") + ds_id = uri2dsid(doi if doi else child_url) + df["ds_id"] = ds_id return df @@ -89,6 +93,7 @@ def fetch_children( df_list = [] for i, child_uri in enumerate(ds.children): url = process.url_from_uri(child_uri) + ds_id = uri2dsid(child_uri) size = process.get_html_info(url) # Assess type try: @@ -132,8 +137,10 @@ def fetch_children( df = set_metadata(child) # Add child dataset to list df = exclude_rows(df) - if df is not None: - df_list.append(df) + if df is None: + continue + df["ds_id"] = ds_id + df_list.append(df) # Return result if len(df_list) > 0: @@ -224,6 +231,13 @@ def fix_text(text: str) -> str: return text +def uri2dsid(uri: str) -> str: + """ + Extract PANGAEA dataset ID from url/uri/doi string. 
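+    For example, "https://doi.org/10.1594/PANGAEA.805690" yields "805690".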
+ """ + return uri.split("PANGAEA.")[-1] + + def get_dataset_id(df: DataFrame) -> str: """Take a Pandas DataFrame as input and return the datasets Pangaea ID.""" col = find_column_match("doi") diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 7df2ec1..01c487a 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -72,7 +72,10 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: df["Longitude"] = long df["Latitude"] = lat df["dataset_title"] = ds.title - df["DOI"] = getattr(ds, "doi", "") + doi = getattr(ds, "doi", "") + df["DOI"] = doi + ds_id = datasets.uri2dsid(doi if doi else url) + df["ds_id"] = ds_id if (len(ds.events) > 0) and (ds.events[0].campaign is not None): df["Campaign"] = ds.events[0].campaign.name return df From 2a45e0d1a8a0c10d19d46131380164693fe969d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 02:39:45 +0100 Subject: [PATCH 40/92] MNT: Save children of parents individually, not merged together --- pangaea_downloader/pq_scraper.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7165ba7..e187eb5 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -104,7 +104,22 @@ def search_and_download( + colorama.Fore.RESET ) continue - df = pd.concat(df_list) + for df in df_list: + if df is None: + continue + # Add the parent's ID to the dataframe + df["parent_ds_id"] = ds_id + # Save the child to its own CSV, including a column that + # records the parent's dataset ID + child_id = df.iloc[0]["ds_id"] + child_output_path = os.path.join(output_dir, f"{child_id}.csv") + saved = datasets.save_df( + df, child_output_path, level=1, verbose=verbose - 1 + ) + n_downloads += 1 if saved else 0 + # We have saved all the children individually, so will skip + # saving a redundant merged dataframe + continue else: try: dataset_type = process.ds_type(size) From 93ec48c3775e64f9788854bea6bcc2c9a9922de8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 09:43:15 +0100 Subject: [PATCH 41/92] ENH: Record child to parent dataset ID mapping --- pangaea_downloader/pq_scraper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index e187eb5..7cd6ee6 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -56,6 +56,10 @@ def search_and_download( os.makedirs(output_dir, exist_ok=True) df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) + fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") + # Process each result dictionary n_files = 0 n_downloads = 0 @@ -117,6 +121,8 @@ def search_and_download( df, child_output_path, level=1, verbose=verbose - 1 ) n_downloads += 1 if saved else 0 + with open(fname_child2parent, "a") as f: + f.write(f"{child_id},{ds_id}\n") # We have saved all the children individually, so will skip # saving a redundant merged dataframe continue From c0c75cc7e0720575f6cbcf1784b07cd5a6b6774d Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:02:10 +0100 Subject: [PATCH 42/92] MNT: Fix latitude- and longitude- lookup --- pangaea_downloader/merge_benthic_datasets.py | 24 +++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git 
a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6e90b99..59c44c3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -371,14 +371,22 @@ def reformat_df(df, remove_duplicate_columns=True): df.rename(columns=mapping, inplace=True, errors="raise") # Handle latitudesouth and longitudewest - if "latitude" not in df.columns and "latitudesouth" in df.columns: - df["latitude"] = -df["latitudesouth"] - if "latitude" not in df.columns and "latitude-" in df.columns: - df["latitude"] = -df["latitude-"] - if "longitude" not in df.columns and "longitudewest" in df.columns: - df["longitude"] = -df["longitudewest"] - if "longitude" not in df.columns and "longitude-" in df.columns: - df["longitude"] = -df["longitude-"] + if "latitude" not in df.columns and "latitudesouth" in lower_cols: + col = df.columns[lower_cols.index("latitudesouth")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "latitude" not in df.columns and "latitude-" in lower_cols: + col = df.columns[lower_cols.index("latitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["latitude"] = -df[col] + if "longitude" not in df.columns and "longitudewest" in lower_cols: + col = df.columns[lower_cols.index("longitudewest")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] + if "longitude" not in df.columns and "longitude-" in lower_cols: + col = df.columns[lower_cols.index("longitude-")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["longitude"] = -df[col] # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) From 0099de3805ee91a21d3e831e082ec6d3e346dda8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:04:52 +0100 Subject: [PATCH 43/92] MNT: Fix method for merging elevation data with depth data --- pangaea_downloader/merge_benthic_datasets.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 59c44c3..8958f87 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -304,7 +304,6 @@ def reformat_df(df, remove_duplicate_columns=True): "bathymetry", "bathy", "depth", - "elevation", ], "backscatter": [], "temperature": ["temperature", "temp"], @@ -388,6 +387,12 @@ def reformat_df(df, remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + # Use elevation if there was no depth + if "depth" not in df.columns and "elevation" in lower_cols: + col = df.columns[lower_cols.index("elevation")] + print(f"Using {col} for {df.iloc[0]['dataset']}") + df["depth"] = -df[col] + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 36570fc68e1faaa8e79020ca184ef2a60e42db26 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:05:16 +0100 Subject: [PATCH 44/92] MNT: Redact erroneously negative depth values --- pangaea_downloader/merge_benthic_datasets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 8958f87..ae2448f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -387,6 +387,11 @@ def reformat_df(df, 
remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + # Remove datapoints with erroneous negative depth + if "depth" in df.columns: + # Only observed two datapoints where this happens + df.loc[df["depth"] < 0, "depth"] = pd.NA + # Use elevation if there was no depth if "depth" not in df.columns and "elevation" in lower_cols: col = df.columns[lower_cols.index("elevation")] From 5760dd5c9bfd3444edc10a33d673eab8d01da5a2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:05:49 +0100 Subject: [PATCH 45/92] ENH: Handle heightaboveseafloor as an altitude field --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index ae2448f..088b12f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -297,7 +297,7 @@ def reformat_df(df, remove_duplicate_columns=True): ], "x_pos": [], "y_pos": [], - "altitude": ["altitude", "height"], + "altitude": ["altitude", "heightaboveseafloor", "height"], "depth": [ "depthwater", "bathydepth", From daedd46ab06b308c0a866f37074d50691f919e53 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:08:37 +0100 Subject: [PATCH 46/92] ENH: Add kwargs pass-through to interpolate_by_datetime --- pangaea_downloader/merge_benthic_datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 088b12f..c65e775 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1154,7 +1154,7 @@ def add_missing_datetime(df, ds_id=None, verbose=1): return df -def interpolate_by_datetime(df, columns): +def interpolate_by_datetime(df, columns, **kwargs): """ Use datetime column to interpolate values for selected columns. @@ -1165,6 +1165,8 @@ def interpolate_by_datetime(df, columns): in other columns. columns : str or iterable of str Name of column or columns to fill in missing values with interpolation. + **kwargs + Additional arguments as per :func:`numpy.interp`. 
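+        For example, passing ``left=numpy.nan`` and ``right=numpy.nan`` leaves values
+        outside the measured datetime range unfilled rather than clamping them to the
+        first or last known value.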
Returns ------- @@ -1185,6 +1187,7 @@ def interpolate_by_datetime(df, columns): datetime_actual[has_dt_not_col], datetime_actual[has_dt_and_col], df.loc[has_dt_and_col, col], + **kwargs, ) return df From df1a8558afba11913fac10c6bc3ae3004b6e3930 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:10:26 +0100 Subject: [PATCH 47/92] MNT: Don't extrapolate depth beyond measured values --- pangaea_downloader/merge_benthic_datasets.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index c65e775..50553ba 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1180,6 +1180,12 @@ def interpolate_by_datetime(df, columns, **kwargs): if isinstance(columns, str): columns = [columns] for col in columns: + interp_kwargs = kwargs + if col in ["depth", "altitude"]: + if "left" not in interp_kwargs: + interp_kwargs["left"] = np.nan + if "right" not in interp_kwargs: + interp_kwargs["right"] = np.nan has_col = ~df[col].isna() has_dt_and_col = has_datetime & has_col has_dt_not_col = has_datetime & ~has_col @@ -1187,7 +1193,7 @@ def interpolate_by_datetime(df, columns, **kwargs): datetime_actual[has_dt_not_col], datetime_actual[has_dt_and_col], df.loc[has_dt_and_col, col], - **kwargs, + **interp_kwargs, ) return df From 9bfe308c1eedd4981e99db51b480acbcda530276 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:13:13 +0100 Subject: [PATCH 48/92] ENH: Interpolate holes in depth values based on datetime --- pangaea_downloader/merge_benthic_datasets.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 50553ba..d2cff88 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1406,6 +1406,18 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # Fill in any missing latitude and longitude metadata df = interpolate_by_datetime(df, ["latitude", "longitude"]) + if ( + (702075 <= ds_id <= 702080) + or (818484 <= ds_id <= 818509) + or ds_id in [849287, 849289] + or 862084 <= ds_id <= 862097 + or ds_id in [875072, 875074] + or 875081 <= ds_id <= 875085 + ): + if verbose >= 1: + print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") + df = interpolate_by_datetime(df, ["depth"]) + if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the # mean coordinate reported at the dataset level From 10caaa033f4654c99abd7b4cbad824f79ef93c36 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:13:50 +0100 Subject: [PATCH 49/92] BUG: Check if child dataframe is empty before trying to save --- pangaea_downloader/pq_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index 7cd6ee6..e99f14d 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -109,7 +109,7 @@ def search_and_download( ) continue for df in df_list: - if df is None: + if df is None or len(df) == 0: continue # Add the parent's ID to the dataframe df["parent_ds_id"] = ds_id From 53356a62894fd652f67f1d634c3b376ff5486c4b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:15:08 +0100 Subject: [PATCH 50/92] MNT: Save empty parent CSV for easy search download resumption 
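The interpolate_by_datetime changes in the preceding patches boil down to numpy.interp over timestamps. A minimal single-column sketch, assuming rows are ordered by time; the names here are illustrative and the real function handles several columns at once:

    import numpy as np
    import pandas as pd

    def interp_column_by_datetime(df, col, **interp_kwargs):
        """Fill missing values of `col` by interpolating over the datetime column."""
        dt = pd.to_datetime(df["datetime"], errors="coerce")
        known = dt.notna() & df[col].notna()    # rows that anchor the interpolation
        missing = dt.notna() & df[col].isna()   # rows that can be filled in
        # np.interp needs plain numbers, so use nanoseconds since the epoch;
        # only rows with a valid datetime are indexed, so NaT never reaches it
        df.loc[missing, col] = np.interp(
            dt[missing].astype("int64"),
            dt[known].astype("int64"),
            df.loc[known, col],
            **interp_kwargs,
        )
        return df

Passing left=np.nan and right=np.nan, as done above for the depth columns, means timestamps outside the measured range stay empty instead of being pinned to the first or last measurement.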
--- pangaea_downloader/pq_scraper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/pq_scraper.py b/pangaea_downloader/pq_scraper.py index e99f14d..9623b10 100755 --- a/pangaea_downloader/pq_scraper.py +++ b/pangaea_downloader/pq_scraper.py @@ -57,8 +57,9 @@ def search_and_download( df_results.to_csv(output_dir.rstrip("/") + "_search_results.csv", index=False) fname_child2parent = output_dir.rstrip("/") + "_child2parent.csv" - with open(fname_child2parent, "w") as f: - f.write("child,parent\n") + if not os.path.isfile(fname_child2parent): + with open(fname_child2parent, "w") as f: + f.write("child,parent\n") # Process each result dictionary n_files = 0 @@ -125,6 +126,9 @@ def search_and_download( f.write(f"{child_id},{ds_id}\n") # We have saved all the children individually, so will skip # saving a redundant merged dataframe + # But we will save an empty file so we know to skip + with open(output_path, "w") as f: + f.write("is_parent") continue else: try: From b5f3da5b4375cdd90825b91165724db88aac2d47 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:20:49 +0100 Subject: [PATCH 51/92] STY: Import from instead of aliasing --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 01c487a..0af5cea 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -9,7 +9,7 @@ from pangaeapy import PanDataSet from requests.compat import urljoin -import pangaea_downloader.tools.datasets as datasets +from pangaea_downloader.tools import datasets def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: From ff2e9a445c55238207bf7985491b02e0875de34f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:23:32 +0100 Subject: [PATCH 52/92] ENH: Add wrapper to requests.get with 30s backoff on 429 status --- pangaea_downloader/tools/requesting.py | 49 ++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 pangaea_downloader/tools/requesting.py diff --git a/pangaea_downloader/tools/requesting.py b/pangaea_downloader/tools/requesting.py new file mode 100644 index 0000000..1ec7ef0 --- /dev/null +++ b/pangaea_downloader/tools/requesting.py @@ -0,0 +1,49 @@ +""" +URL request utilities. +""" + +import time + +import requests + + +def get_request_with_backoff(url, retries=5, backoff_factor=1, verbose=1, **kwargs): + """ + Fetch a URL resource using requests with a custom backoff strategy for re-attempts. + + Parameters + ---------- + url : str + The URL to request. + retries : int, default=5 + Maximum number of attempts. + backoff_factor : float, default=1 + Base time to wait for before attempting to download again when receiving + a 500 or 503 HTTP status code. + verbose : int, default=1 + Verbosity level. + **kwargs + Additional arguments as per :func:`requests.get`. + """ + for i_attempt in range(retries): + r = requests.get(url, **kwargs) + if r.status_code not in [429, 500, 503]: + # Status code looks good + break + # N.B. Could also retry on [408, 502, 504, 599] + if r.status_code == 429: + # PANGAEA has a maximum of 180 requests within a 30s period + # Wait for this to cool off completely. + t_wait = 30 + else: + # Other errors indicate a server side error. Wait a + # short period and then retry to see if it alleviates. 
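+            # e.g. with backoff_factor=1 the retry waits grow as 1 s, 2 s, 4 s, ...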
+ t_wait = backoff_factor * 2**i_attempt + if verbose >= 1: + print( + "Retrying in {} seconds (HTTP Status {}): {}".format( + t_wait, r.status_code, url + ) + ) + time.sleep(t_wait) + return r From 55a4563e9aec764fe85a6e00334ef55af334d5d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:25:00 +0100 Subject: [PATCH 53/92] ENH: Use 30s backoff on 429 status --- pangaea_downloader/citations.py | 5 +++-- pangaea_downloader/licenses.py | 5 +++-- pangaea_downloader/tools/eda.py | 5 +++-- pangaea_downloader/tools/process.py | 5 +++-- pangaea_downloader/tools/scraper.py | 8 ++++---- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pangaea_downloader/citations.py b/pangaea_downloader/citations.py index db6796f..833463c 100644 --- a/pangaea_downloader/citations.py +++ b/pangaea_downloader/citations.py @@ -1,13 +1,14 @@ import pickle import pandas as pd -import requests + +from .tools import requesting def get_bibtex(ds_id: str, verbose=False) -> str: """Get the BibTex Citation of a Pangaea dataset using the dataset ID.""" bib_url = f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}?format=citation_bibtex" - resp = requests.get(bib_url) + resp = requesting.get_request_with_backoff(bib_url) if verbose: print("\tStatus code:", resp.status_code) return resp.text diff --git a/pangaea_downloader/licenses.py b/pangaea_downloader/licenses.py index 4ba9a38..8cfac90 100644 --- a/pangaea_downloader/licenses.py +++ b/pangaea_downloader/licenses.py @@ -5,10 +5,11 @@ from typing import Dict, Optional, Union import pandas as pd -import requests from bs4 import BeautifulSoup from tqdm import tqdm +from .tools import requesting + def get_dataset_url(ds_id: Union[str, int]) -> str: """Return dataset URL given the six digit dataset ID.""" @@ -18,7 +19,7 @@ def get_dataset_url(ds_id: Union[str, int]) -> str: def get_dataset_license_info(url: str) -> Optional[Dict[str, str]]: """Return a dictionary with license information given the dataset URL.""" # Make a request to the URL and parse the html - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") # Get the tag containing the license info license_tag = soup.find("a", attrs={"rel": "license"}) diff --git a/pangaea_downloader/tools/eda.py b/pangaea_downloader/tools/eda.py index 219fa31..9696c93 100644 --- a/pangaea_downloader/tools/eda.py +++ b/pangaea_downloader/tools/eda.py @@ -4,10 +4,11 @@ import matplotlib.cm import matplotlib.colors import numpy as np -import requests from matplotlib.pyplot import get_cmap from sklearn.neighbors import KernelDensity +from . import requesting + def url_from_doi(doi: str) -> str: """ @@ -29,7 +30,7 @@ def img_from_url(url: str, verbose=False) -> np.array: """Take an image url and return retrieved image array.""" success = False while not success: - resp = requests.get(url, stream=True) + resp = requesting.get_request_with_backoff(url, stream=True) print(f"status code: {resp.status_code}") if verbose else 0 success = True if (resp.status_code == 200) else False if success: diff --git a/pangaea_downloader/tools/process.py b/pangaea_downloader/tools/process.py index 6d92e73..74d8ea6 100644 --- a/pangaea_downloader/tools/process.py +++ b/pangaea_downloader/tools/process.py @@ -1,9 +1,10 @@ """Functions for processing each of the result items.""" from typing import Optional, Tuple -import requests from bs4 import BeautifulSoup +from . 
import requesting + def url_from_uri(uri: str, base_url="https://doi.pangaea.de/") -> str: """Take a pangaea uri/doi string as input and return its corresponding url string.""" @@ -28,7 +29,7 @@ def get_result_info(res: dict) -> Tuple[str, str, str, str, bool]: def get_html_info(url: str) -> Optional[str]: """Make get request to dataset webpage and return dataset size.""" # Make get request to webpage - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) if resp.status_code == 200: # Parse html soup = BeautifulSoup(resp.text, "lxml") diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 0af5cea..b5871b5 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -9,7 +9,7 @@ from pangaeapy import PanDataSet from requests.compat import urljoin -from pangaea_downloader.tools import datasets +from pangaea_downloader.tools import datasets, requesting def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: @@ -22,7 +22,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: # Request dataset url if verbose >= 1: print("\t\t\t[INFO] Requesting:", url) - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) # Parse response soup = BeautifulSoup(resp.text, "lxml") # Get coordinates of expedition @@ -58,7 +58,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: if verbose >= 1: print("\t\t\t[INFO] URL to photos page:", download_link) # Get to photos page (page 1) - resp = requests.get(download_link) + resp = requesting.get_request_with_backoff(download_link) photos_page = BeautifulSoup(resp.text, "lxml") img_urls = get_urls_from_each_page(photos_page, src_url, verbose=verbose) if img_urls is None: @@ -107,7 +107,7 @@ def get_urls_from_each_page( if verbose >= 1: print(f"\t\t\t[INFO] Processing Page {n}...") url = pagination[n] - resp = requests.get(url) + resp = requesting.get_request_with_backoff(url) soup = BeautifulSoup(resp.text, "lxml") urls = get_page_image_urls(soup, verbose=verbose) img_urls.extend(urls) From 87cd24ac72361795b080d30464e366c6c34ef75b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:25:40 +0100 Subject: [PATCH 54/92] JNB: Fix reference to benthicnet.io utilities --- notebooks/explore-depth-columns.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index b84d525..27aeaaf 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -18,7 +18,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from benthicnet.utils import sanitize_filename, sanitize_filename_series\n", + "from benthicnet.io import sanitize_filename, sanitize_filename_series\n", "from IPython.display import display\n", "from tqdm.auto import tqdm\n", "\n", From d6cb21152d1d3c33f8fd2a0b148be8b5f8e99c86 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:29:06 +0100 Subject: [PATCH 55/92] JNB: Don't use low_memory mode loading df --- notebooks/explore-depth-columns.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 27aeaaf..2d4110b 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -249,7 +249,7 @@ "\n", "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", - " df 
= pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -308,7 +308,7 @@ "key = \"bathy depth\"\n", "\n", "for i, file in enumerate(column_examples[key]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " # Extract info\n", @@ -361,7 +361,7 @@ "# Depth bot & depth top\n", "\n", "for i, file in enumerate(column_examples[keys[0]]):\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", @@ -458,7 +458,7 @@ "keys = [\"depth\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", @@ -513,7 +513,7 @@ "keys = [\"depth water\", \"bathy depth\"]\n", "if len(intersect) > 0:\n", " for file in intersect:\n", - " df = pd.read_csv(os.path.join(dirname, file))\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " df.columns = [col.lower() for col in df.columns]\n", " for key in keys:\n", " # Extract info\n", From 6cb1bc5b76e81040b5db21923117c795662be01d Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:29:52 +0100 Subject: [PATCH 56/92] JNB: Fix typo --- notebooks/explore-depth-columns.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 2d4110b..65dedd2 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -272,7 +272,7 @@ " # print(\"\\tMin or Max non-positive.\")\n", " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", - " print(\"\\tStart or Ene near zero.\")\n", + " print(\"\\tStart or End near zero.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, From 973b25de4cbd12fc7a257e8cfe42b575ade5812a Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:30:26 +0100 Subject: [PATCH 57/92] JNB+BUG: Need to reset val_exception before parsing new keys --- notebooks/explore-depth-columns.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 65dedd2..f741734 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -307,6 +307,7 @@ "# Column to find\n", "key = \"bathy depth\"\n", "\n", + "val_exception = {}\n", "for i, file in enumerate(column_examples[key]):\n", " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", " url_column = find_url_column(df)\n", From b583e8d3d312b75b0f3c04633a03898e5580c8c1 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:31:34 +0100 Subject: [PATCH 58/92] JNB+MNT: Reflect yaxis instead of plotting negative of depth --- notebooks/explore-depth-columns.ipynb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index f741734..df659e2 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -265,7 +265,8 @@ " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", " )\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", @@ -321,7 +322,8 @@ " # Show\n", " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", - " plt.plot(-df[key], label=key)\n", + " plt.plot(df[key], label=key)\n", + " plt.gca().invert_yaxis()\n", " plt.show()\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", From 2782beb4d96996e12b915ca47d50d7a26ebd93f7 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:33:27 +0100 Subject: [PATCH 59/92] JNB: Add title, ylabel, and print link to dataset --- notebooks/explore-depth-columns.ipynb | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index df659e2..dea076c 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -266,8 +266,11 @@ " )\n", " plt.figure(figsize=(16, 4))\n", " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " # Datasets that defy column value norms\n", " # if (min_ <= 0) or (max_ <= 0):\n", " # print(\"\\tMin or Max non-positive.\")\n", @@ -323,8 +326,11 @@ " print(f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\")\n", " plt.figure(figsize=(16, 4))\n", " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", " plt.gca().invert_yaxis()\n", " plt.show()\n", + " print(url)\n", " if (min_ < 0) or (max_ < 0):\n", " print(\"\\tDoes not satisfy column value norms.\")\n", " val_exception[url] = (mean, sd, min_, max_)" @@ -383,7 +389,9 @@ " plt.plot(df[key], label=key)\n", " plt.plot(abs(df[\"depth top\"] - df[\"depth bot\"]), label=\"diff\", linestyle=\":\")\n", " plt.legend()\n", - " plt.show()" + " plt.title(url.split(\"/\")[-1])\n", + " plt.show()\n", + " print(url)" ] }, { From f8663be7d8c1ffa49d7354777c1882af0b55c906 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:34:47 +0100 Subject: [PATCH 60/92] JNB: Highlight negative depth --- notebooks/explore-depth-columns.ipynb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index dea076c..34dcde7 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -277,6 +277,9 @@ " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", " if value_near_zero(start) or value_near_zero(end):\n", " print(\"\\tStart or End near zero.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if min_ < 0:\n", + " print(\"\\tNegative depth.\")\n", " val_exception[url] = (mean, sd, min_, max_, start, end)" ] }, From e809f54c4c5373a4f0479ce90b92371f1fa5376f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:35:41 +0100 Subject: [PATCH 61/92] 
JNB: Plot elevation --- notebooks/explore-depth-columns.ipynb | 173 +++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 3 deletions(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 34dcde7..2f3c590 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -397,6 +397,73 @@ " print(url)" ] }, + { + "cell_type": "markdown", + "id": "0ee401a9-e936-4d8b-915d-ed3b1303fd65", + "metadata": {}, + "source": [ + "### 2.4 Elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1ae559-e6ee-47b8-8f20-69bcef238cb5", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Column to find\n", + "key = \"elevation\"\n", + "\n", + "val_exception = {}\n", + "for i, file in enumerate(column_examples[key]):\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " url_column = find_url_column(df)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Check for start and end at 0 altitude/depth\n", + " start, end = df[key].iloc[0], df[key].iloc[-1]\n", + " # Show\n", + " print(\n", + " f\"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}\"\n", + " )\n", + " plt.figure(figsize=(16, 4))\n", + " plt.plot(df[key], label=key)\n", + " plt.ylabel(key)\n", + " plt.title(url.split(\"/\")[-1] + \" : \" + key)\n", + " plt.show()\n", + " print(url)\n", + " # Datasets that defy column value norms\n", + " # if (min_ <= 0) or (max_ <= 0):\n", + " # print(\"\\tMin or Max non-positive.\")\n", + " # val_exception[url] = (mean, sd, min_, max_, start, end)\n", + " if max_ > 0:\n", + " print(\"\\tPositive elevation.\")\n", + " val_exception[url] = (mean, sd, min_, max_, start, end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb64c1f-b5b3-4d8c-9943-b9c6810a1d53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_exception" + ] + }, { "cell_type": "markdown", "id": "83e54609", @@ -416,7 +483,7 @@ "print(len(column_examples[\"depth water\"]))\n", "print(len(column_examples[\"bathy depth\"]))\n", "print(len(column_examples[\"bathy depth_2\"]))\n", - "print(len(column_examples[\"bathy_depth\"]))" + "print(len(column_examples[\"elevation\"]))" ] }, { @@ -614,13 +681,113 @@ "**NOTE:** Upon checking the dataset webpages we see that the two bathy depth columns correspond to the original collection and recollection sites." 
] }, + { + "cell_type": "markdown", + "id": "61d6de0f-09d8-43f5-a2b6-c47afed77a9d", + "metadata": {}, + "source": [ + "## 3.5 Datasets with depth water and elevation" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6aabda82", + "id": "c3627d1c-717d-4dc2-b20d-761adebd513d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column1 = \"depth water\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eeae8d64-0038-47ea-bc0b-8e59e0724b5e", "metadata": {}, + "source": [ + "## 3.6 Datasets with bathy depth and elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eccfc931-307a-4007-8306-6ea918a1489b", + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "column1 = \"bathy depth\"\n", + "column2 = \"elevation\"\n", + "\n", + "fnames_set1 = set(column_examples[column1])\n", + "fnames_set2 = set(column_examples[column2])\n", + "intersect = fnames_set1.intersection(fnames_set2)\n", + "\n", + "print(f\"{column1} count:\", len(fnames_set1))\n", + "print(f\"{column2} count:\", len(fnames_set2))\n", + "print(\"# of files with both:\", len(intersect))\n", + "print()\n", + "\n", + "keys = [column1, column2]\n", + "if len(intersect) > 0:\n", + " for file in intersect:\n", + " df = pd.read_csv(os.path.join(dirname, file), low_memory=False)\n", + " df.columns = [col.lower() for col in df.columns]\n", + " for key in keys:\n", + " # Extract info\n", + " mean = df[key].mean()\n", + " sd = df[key].std()\n", + " min_ = df[key].min()\n", + " max_ = df[key].max()\n", + " url = get_dataset_url(file)\n", + " # Show\n", + " print(\n", + " f\"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}\"\n", + " )\n", + " # Plot\n", + " plt.figure(figsize=(16, 4))\n", + " for key in keys:\n", + " factor = 1 if key == \"elevation\" else -1\n", + " plt.plot(factor * df[key], label=key.capitalize())\n", + " plt.legend()\n", + " plt.show()" + ] } ], "metadata": { From 154ff1fca76171fa4bc1a889c06739f17af81bb6 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:48:49 +0100 Subject: [PATCH 62/92] BUG: Need to drop columns after handling reversed columns --- pangaea_downloader/merge_benthic_datasets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py 
index d2cff88..45e3b31 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -364,8 +364,6 @@ def reformat_df(df, remove_duplicate_columns=True): elif col not in mapping and col not in cols_to_drop: cols_to_drop.append(col) - # Remove superfluous columns - df.drop(labels=cols_to_drop, axis="columns", inplace=True) # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") @@ -398,6 +396,9 @@ def reformat_df(df, remove_duplicate_columns=True): print(f"Using {col} for {df.iloc[0]['dataset']}") df["depth"] = -df[col] + # Remove superfluous columns + df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 4a3cb25b5c8abe2c8fd213e576a4efc934b7edd2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 17:49:23 +0100 Subject: [PATCH 63/92] MNT: Drop latitude-, longitude- if used --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 45e3b31..4ad9507 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -372,18 +372,22 @@ def reformat_df(df, remove_duplicate_columns=True): col = df.columns[lower_cols.index("latitudesouth")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] + cols_to_drop.append("latitudesouth") if "latitude" not in df.columns and "latitude-" in lower_cols: col = df.columns[lower_cols.index("latitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] + cols_to_drop.append("latitude-") if "longitude" not in df.columns and "longitudewest" in lower_cols: col = df.columns[lower_cols.index("longitudewest")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + cols_to_drop.append("longitudewest") if "longitude" not in df.columns and "longitude-" in lower_cols: col = df.columns[lower_cols.index("longitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] + cols_to_drop.append("longitude-") # Remove datapoints with erroneous negative depth if "depth" in df.columns: From 42bdc5374268cdd72fcfafe17b7f355357c1f8cb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:31:01 +0100 Subject: [PATCH 64/92] MNT: Save depth_of_observer, bathymetry, and elevation separately --- pangaea_downloader/merge_benthic_datasets.py | 41 +++++++++----------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 4ad9507..74b3e8b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -298,13 +298,9 @@ def reformat_df(df, remove_duplicate_columns=True): "x_pos": [], "y_pos": [], "altitude": ["altitude", "heightaboveseafloor", "height"], - "depth": [ - "depthwater", - "bathydepth", - "bathymetry", - "bathy", - "depth", - ], + "depth_of_observer": ["depthwater", "depth"], + "bathymetry": ["bathydepth", "bathymetry", "bathy"], + "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], "salinity": ["salinity", "sal"], @@ -390,15 +386,9 @@ def reformat_df(df, remove_duplicate_columns=True): cols_to_drop.append("longitude-") # Remove datapoints with 
erroneous negative depth - if "depth" in df.columns: + if "depth_of_observer" in df.columns: # Only observed two datapoints where this happens - df.loc[df["depth"] < 0, "depth"] = pd.NA - - # Use elevation if there was no depth - if "depth" not in df.columns and "elevation" in lower_cols: - col = df.columns[lower_cols.index("elevation")] - print(f"Using {col} for {df.iloc[0]['dataset']}") - df["depth"] = -df[col] + df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA # Remove superfluous columns df.drop(labels=cols_to_drop, axis="columns", inplace=True) @@ -1185,8 +1175,10 @@ def interpolate_by_datetime(df, columns, **kwargs): if isinstance(columns, str): columns = [columns] for col in columns: + if col not in df: + continue interp_kwargs = kwargs - if col in ["depth", "altitude"]: + if col in ["depth", "depth_of_observer", "bathymetry", "altitude"]: if "left" not in interp_kwargs: interp_kwargs["left"] = np.nan if "right" not in interp_kwargs: @@ -1273,7 +1265,9 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): if verbose >= 1: print(f"Interpolating latitude, longitude, and depth for dataset {ds_id}") # Interpolate lat, lon, and depth based on datetime - df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) if ds_id in [875071, 875073]: if verbose >= 1: @@ -1284,7 +1278,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # from the subsequent image, so we don't need the ones without metadata. df = df[~df["datetime"].isna()] # Interpolate missing depth values - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if ds_id in [875084]: if verbose >= 1: @@ -1293,7 +1287,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): # The first three are of the deck, the rest are dark watercolumn shots. 
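        # (These are the frames removed by the missing-longitude filter below.)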
df = df[~df["longitude"].isna()] # Interpolate missing depth values - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if (878001 <= ds_id <= 878019) or ds_id == 878045: if verbose >= 1: @@ -1366,7 +1360,9 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): print( f"{ds_id}: Interpolating latitude, longitude, and depth for dataset {ds_id}" ) - df = interpolate_by_datetime(df, ["latitude", "longitude", "depth"]) + df = interpolate_by_datetime( + df, ["latitude", "longitude", "depth_of_observer", "bathymetry"] + ) if ds_id in [914155]: if verbose >= 1: @@ -1421,7 +1417,7 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): ): if verbose >= 1: print(f"{ds_id}: Interpolating missing depth metadata for dataset {ds_id}") - df = interpolate_by_datetime(df, ["depth"]) + df = interpolate_by_datetime(df, ["depth_of_observer", "bathymetry"]) if any(df["latitude"].isna() | df["longitude"].isna()): # Fill in any missing latitude and longitude values with the @@ -1578,7 +1574,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "latitude", "longitude", "altitude", - "depth", + "depth_of_observer", + "bathymetry", "backscatter", "temperature", "salinity", From eda209d6c26eab8f979bafa97b94de99cac0d223 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:32:34 +0100 Subject: [PATCH 65/92] MNT: Rearrange so old columns are dropped before mapping new ones onto them --- pangaea_downloader/merge_benthic_datasets.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 74b3e8b..2916f3a 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -360,6 +360,9 @@ def reformat_df(df, remove_duplicate_columns=True): elif col not in mapping and col not in cols_to_drop: cols_to_drop.append(col) + # Remove superfluous columns + df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Rename columns to canonical names df.rename(columns=mapping, inplace=True, errors="raise") @@ -368,31 +371,24 @@ def reformat_df(df, remove_duplicate_columns=True): col = df.columns[lower_cols.index("latitudesouth")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] - cols_to_drop.append("latitudesouth") if "latitude" not in df.columns and "latitude-" in lower_cols: col = df.columns[lower_cols.index("latitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["latitude"] = -df[col] - cols_to_drop.append("latitude-") if "longitude" not in df.columns and "longitudewest" in lower_cols: col = df.columns[lower_cols.index("longitudewest")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] - cols_to_drop.append("longitudewest") if "longitude" not in df.columns and "longitude-" in lower_cols: col = df.columns[lower_cols.index("longitude-")] print(f"Using {col} for {df.iloc[0]['dataset']}") df["longitude"] = -df[col] - cols_to_drop.append("longitude-") # Remove datapoints with erroneous negative depth if "depth_of_observer" in df.columns: # Only observed two datapoints where this happens df.loc[df["depth_of_observer"] < 0, "depth_of_observer"] = pd.NA - # Remove superfluous columns - df.drop(labels=cols_to_drop, axis="columns", inplace=True) - # Add file extension to image df["image"] = df.apply(add_file_extension, axis=1) # if "timestamp" not in df.columns and "datetime" in df.columns: From 
4d5718839b0a2191875c813dac74731c31eaa9f8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:33:01 +0100 Subject: [PATCH 66/92] MNT: Change warning colour from red to yellow --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index b5871b5..45d5782 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -29,7 +29,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: coordinates = get_metadata(soup) if coordinates is None and hasattr(ds, "geometryextent"): print( - colorama.Fore.RED + "\t\t\t[ALERT] Trying to get coordinates from" + colorama.Fore.YELLOW + "\t\t\t[ALERT] Trying to get coordinates from" " PanDataSet.geometryextent" + colorama.Fore.RESET ) lat = None From 0dc20c61703617289c5d74ba8b62cf41c8f107b5 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 30 Mar 2023 23:33:19 +0100 Subject: [PATCH 67/92] MNT: Change Campaign -> campaign --- pangaea_downloader/tools/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/scraper.py b/pangaea_downloader/tools/scraper.py index 45d5782..a36cd34 100644 --- a/pangaea_downloader/tools/scraper.py +++ b/pangaea_downloader/tools/scraper.py @@ -77,7 +77,7 @@ def scrape_image_data(url: str, verbose=1) -> Optional[DataFrame]: ds_id = datasets.uri2dsid(doi if doi else url) df["ds_id"] = ds_id if (len(ds.events) > 0) and (ds.events[0].campaign is not None): - df["Campaign"] = ds.events[0].campaign.name + df["campaign"] = ds.events[0].campaign.name return df From 87452af3d1dd467213ced87221b87f74a5be858e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:54:09 +0100 Subject: [PATCH 68/92] BUG: Add pangaea- to ds_id for dataframe output --- pangaea_downloader/merge_benthic_datasets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2916f3a..1240513 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1491,7 +1491,11 @@ def process_datasets(input_dirname, output_path=None, verbose=0): files_without_url.append(fname) continue - df["ds_id"] = f"pangaea-{ds_id}" + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + else: + df["ds_id"] = f"pangaea-{ds_id}" + df = reformat_df(df) if df is None: continue From d135600f7f42f3d385af3c47e9b5d6be089882d2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:54:59 +0100 Subject: [PATCH 69/92] MNT: Allow photographs of tiles --- pangaea_downloader/merge_benthic_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 1240513..4a1bdf4 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -204,7 +204,7 @@ def check_title(title): ): return False if "photographs of tiles" in title.lower(): - return False + pass return True From a3b14e20b8a80234320fb04193321e3bd784d6f0 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:55:42 +0100 Subject: [PATCH 70/92] ENH: Include parent_ds_id in output dataframe --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py 
b/pangaea_downloader/merge_benthic_datasets.py index 4a1bdf4..0d3ec6b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1581,6 +1581,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "salinity", "chlorophyll", "acidity", + "parent_ds_id", } df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] From 2082aa31901b4bfbf1fe68a8f8194052c23e66d4 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Mon, 3 Apr 2023 11:58:14 +0100 Subject: [PATCH 71/92] MNT: Remove self-imposed rate-limit so cached data is loaded immediately --- pangaea_downloader/tools/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index c8a591b..4a34874 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -18,7 +18,8 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0.1667 +T_POLL_INTV = 0 # Allow rapid loading of cached records +# T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s def fetch_child( From f236e0cd84f162e6ba8f5d35a8ff465747a2d89f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:18:10 +0100 Subject: [PATCH 72/92] ENH: Include url_thumbnail column --- pangaea_downloader/merge_benthic_datasets.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 0d3ec6b..1b458cf 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -265,6 +265,7 @@ def reformat_df(df, remove_duplicate_columns=True): # is the output column name, and the value is a list of search names # in order of priority. The first match will be kept and others discarded. 
desired_columns = { + "url_thumbnail": ["urlthumb", "urlthumbnail"], "dataset": ["ds_id"], "site": ["Event", "event", "deployment"], "image": ["image", "filename"], @@ -329,6 +330,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -342,6 +345,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -355,6 +360,8 @@ def reformat_df(df, remove_duplicate_columns=True): if not found: found = True mapping[col] = canon + if col in cols_to_drop: + cols_to_drop.remove(col) if col != canon and canon in df.columns: cols_to_drop.append(canon) elif col not in mapping and col not in cols_to_drop: @@ -1569,6 +1576,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "dataset", "site", "url", + "url_thumbnail", "image", "datetime", "latitude", From 28739119942be735c2a001eb5f9496e71580c632 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:21:43 +0100 Subject: [PATCH 73/92] ENH: Find area columns encoding image area in square meters --- pangaea_downloader/merge_benthic_datasets.py | 21 ++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 1b458cf..76966ee 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd import scipy.interpolate +from pandas.api.types import is_numeric_dtype from pangaeapy import PanDataSet from tqdm.auto import tqdm @@ -235,8 +236,21 @@ def reformat_df(df, remove_duplicate_columns=True): # Make a copy of the dataframe so we can't overwrite the input df = df.copy() - # Remove bad columns - df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") + # Get dataset id from first row + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = ds_id.split("-")[-1] + + # Handle Area column + for col in ["Area", "Area_2", "Area_3"]: + # Area is sometimes the seafloor surface area of the image in + # meters^2 and sometimes used as a synonym for location + if col in df.columns and not all(df[col].isna()) and is_numeric_dtype(df[col]): + print(df.columns) + print(f"{ds_id}: Using {col} for area measurement") + df.rename(columns={col: "area"}, inplace=True, errors="raise") + break + # Remove duplicately named columns cols_to_drop = [] if remove_duplicate_columns: @@ -250,6 +264,8 @@ def reformat_df(df, remove_duplicate_columns=True): ): cols_to_drop.append(col) df.drop(labels=cols_to_drop, axis="columns", inplace=True) + # Remove bad columns + df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore") # Find the correct URL column, and drop other columns containing "url" cols_to_drop = [] @@ -1581,6 +1597,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): "datetime", "latitude", "longitude", + "area", "altitude", "depth_of_observer", "bathymetry", From 6a5c687156db6234154646895263b02843dad6bb Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:22:27 +0100 Subject: [PATCH 74/92] MNT: Find and remove additional 
FAVOURITE duplicate images --- pangaea_downloader/merge_benthic_datasets.py | 28 +++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 76966ee..76cd3a1 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -741,8 +741,10 @@ def fixup_favourite_images(df, verbose=1): """ Drop duplicated favourite images. - These occur in Ingo Schewe's datasets along OFOS profiles during POLARSTERN - cruises, PANGAEA dataset ids 849814--849816 and 873995--874002. + These occur in Schewe and Bergmann's datasets along OFOS profiles during + POLARSTERN cruises, PANGAEA dataset ids 849814--849816. 873995--874002, + 895102--895104, 896545--896549, 896653--896657, 912471. + Parameters ---------- @@ -757,14 +759,22 @@ def fixup_favourite_images(df, verbose=1): As input dataframe, but with all Type entries starting with favourite removed (case-insensitive). """ - if "Type" not in df.columns: - return df - # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and - # FAVOURITE_HOTKEY entries, which although they have unique URLs for their - # images are actually identical images to the ones occuring immediately - # after them in the dataframe. n_samples_before = len(df) - df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "Type" in df.columns: + # Remove all Favourite timer, Favourite hotkey, FAVOURITE_TIMER, and + # FAVOURITE_HOTKEY entries, which although they have unique URLs for their + # images are actually identical images to the ones occuring immediately + # after them in the dataframe. + df = df[~df["Type"].str.lower().str.startswith("favourite")] + if "image" in df.columns: + # Check if the image filename field is repeated except for a leading + # "FAVOURITE_" string, if so remove it. These images are identical + # copies of the other images. 
+ select = df["image"].str.lower().str.startswith("favourite") + image_tmp = df["image"].str.replace("FAVOURITE_", "", case=False, regex=False) + is_repeated = image_tmp.duplicated(False) + # Remove favourite images which are repeated + df = df[~(select & is_repeated)] n_samples_after = len(df) if verbose >= 1 and n_samples_after != n_samples_before: print( From 2808ad9a470571d6b7486d34ed6b50f528dc0d7f Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Tue, 4 Apr 2023 11:22:56 +0100 Subject: [PATCH 75/92] MNT: Print files which had duplicated URLs resolved --- pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 76cd3a1..e3254e5 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1581,9 +1581,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print( f"Of which {len(files_with_repeat_urls)} have repeated URLs (before replacing dups with image)" ) + for fname in files_with_repeat_urls: + print(f" {fname}") print( f"Of which {len(files_with_repeat_urls2)} have repeated URLs (after replacing dups with image)" ) + for fname in files_with_repeat_urls2: + print(f" {fname}") print() print(f"There are {len(column_count)} unique column names:") print() From aadf712dad1d6da9b15e4286ccd4cead341c75ba Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:50:06 +0100 Subject: [PATCH 76/92] Revert "MNT: Remove self-imposed rate-limit so cached data is loaded immediately" This reverts commit 2082aa31901b4bfbf1fe68a8f8194052c23e66d4. --- pangaea_downloader/tools/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index 4a34874..e53c600 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -18,8 +18,8 @@ from pangaea_downloader.tools import checker, process, scraper T_POLL_LAST = 0 -T_POLL_INTV = 0 # Allow rapid loading of cached records -# T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s +# T_POLL_INTV = 0 # Allow rapid loading of cached records +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s def fetch_child( From 4f152f16e7f9821cb9b982b2df140a61743cbbb0 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:51:11 +0100 Subject: [PATCH 77/92] MNT: Print number of records before and after dropping duplicates --- pangaea_downloader/merge_benthic_datasets.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index e3254e5..a1ac91f 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1626,10 +1626,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + print(f"There are {len(df_all)} records before dropping duplicated URLs") + # Remove duplicate URLs if verbose >= 1: print("Remove duplicates") df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field if fixup_repeated_output_paths is None: From df0beeabe0e69f1f12051533a3c9dd74bb2f7537 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: 
Wed, 5 Apr 2023 10:52:46 +0100 Subject: [PATCH 78/92] MNT: Print IDs of datasets which may have label columns --- pangaea_downloader/merge_benthic_datasets.py | 32 ++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index a1ac91f..e45ea51 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1504,6 +1504,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): n_valid = 0 dfs = [] dfs_fnames = [] + ids_with_potential_labels = [] for fname in tqdm(sorted(sorted(os.listdir(input_dirname)), key=len)): # noqa: C414 if not fname.endswith(".csv"): @@ -1548,6 +1549,30 @@ def process_datasets(input_dirname, output_path=None, verbose=0): column_count[col] += 1 column_examples[col].append(fname) + for key in [ + # "Type", + "Content", # Yes! + # "Sample label", + # "ID", + # "Sample ID", + "Classification", # Yes! + "Species", # Yes! + # "Reference", + # "Samp type", + "Family", + "Genus", + # "Ind No", + # "Imagery", + # "Img brightness", # No + "Ground vis", # Yes! + "Marine litter", + "Fisheries plastic", + "Unident litter", + ]: + if key in df.columns: + print(f"{fname} has {key}") + ids_with_potential_labels.append(ds_id) + # Drop rows that are complete duplicates df.drop_duplicates(inplace=True) @@ -1598,6 +1623,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): c = col + " " print(f"{c:.<35s} {count:4d}") print() + ids_with_potential_labels = sorted(set(ids_with_potential_labels)) + print( + f"There are {len(ids_with_potential_labels)} datasets which might have labels to extract:" + ) + for ds_id in ids_with_potential_labels: + print(ds_id) + print() if verbose >= 1: print("Filter columns") From f9d5b48211c5f64f70aded629e2ebcb9ba4f3d9b Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 5 Apr 2023 10:53:03 +0100 Subject: [PATCH 79/92] MNT: Rename parent_ds_id -> collection --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index e45ea51..f000409 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1657,6 +1657,7 @@ def process_datasets(input_dirname, output_path=None, verbose=0): df_all = pd.concat( [df[df.columns.intersection(select_cols)] for df in dfs if len(df) > 0] ) + df_all.rename(columns={"parent_ds_id": "collection"}, inplace=True) print(f"There are {len(df_all)} records before dropping duplicated URLs") From 45a3492b9eaafd972d7bd2a0c698b4794e58c7f3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:50:32 +0100 Subject: [PATCH 80/92] BUG: Fix nanosecond output format of datetime in pangaea-907025 --- pangaea_downloader/merge_benthic_datasets.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index f000409..c348057 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1366,12 +1366,17 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): select_and_col = select & ~missing_dt select_not_col = select & missing_dt if any(select_and_col) and any(select_not_col): - df.loc[select_not_col, col] = scipy.interpolate.interp1d( + new_values = scipy.interpolate.interp1d( indices[select_and_col], - 
pd.to_datetime(df.loc[select_and_col, col]), + pd.to_datetime(df.loc[select_and_col, col]).map( + pd.Timestamp.timestamp + ), kind="nearest", fill_value="extrapolate", )(indices[select_not_col]) + new_values = pd.to_datetime(new_values, unit="s") + new_values = new_values.strftime("%Y-%m-%d") + df.loc[select_not_col, col] = new_values if ds_id in [911904, 918924, 919348]: if verbose >= 1: From 13080a1ad7c2604473bd5f56879ff40c085a7734 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:51:03 +0100 Subject: [PATCH 81/92] MNT: Convert parent_ds_id into pangaea-IDENTIFIER like ds_id --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index c348057..6403be0 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1534,6 +1534,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) else: df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) df = reformat_df(df) if df is None: From 1db2a4a9575fc0a6499c5d848e6dae80ba3437e9 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:51:29 +0100 Subject: [PATCH 82/92] DOC: Fix rate limit comment --- pangaea_downloader/tools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangaea_downloader/tools/datasets.py b/pangaea_downloader/tools/datasets.py index e53c600..8b6f83a 100644 --- a/pangaea_downloader/tools/datasets.py +++ b/pangaea_downloader/tools/datasets.py @@ -19,7 +19,7 @@ T_POLL_LAST = 0 # T_POLL_INTV = 0 # Allow rapid loading of cached records -T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 5 requests within 30s +T_POLL_INTV = 0.1667 # Rate-limit ourselves; stay under 180 requests within 30s def fetch_child( From b1b2a7095f1497685043fbd4606a8f6c3ed92b61 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 09:52:13 +0100 Subject: [PATCH 83/92] ENH: Merge down metadata across rows with repeated URLs, preserving details --- pangaea_downloader/merge_benthic_datasets.py | 83 +++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 6403be0..2df21bc 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -29,6 +29,10 @@ fixup_repeated_output_paths = None row2basename = None +# Create new `pandas` methods which use `tqdm` progress +# (can use tqdm_gui, optional kwargs, etc.) +tqdm.pandas() + TAXONOMY_RANKS = [ ["Kingdom", "Regnum"], ["Phylum", "Division"], @@ -1480,6 +1484,81 @@ def fixup_incomplete_metadata(df, ds_id=None, verbose=1): return df +def merge_duplicated_urls(df): + """ + Merge metadata across rows which have the same URL. + """ + print("Original number of rows:", len(df)) + df.drop_duplicates(inplace=True) + print("Number of rows after dropping simple duplicates:", len(df)) + # Record the original sort index so we can get the data back in the original + # order. + df["original_index"] = df.index + # Determine how many images are at the same location. This indicates how + # accurate the latitude and longitude information is. We will want to keep + # the most accurate version of this. 
+ repeat_location_counts = df[["longitude", "latitude"]].value_counts() + repeat_location_counts = repeat_location_counts.to_frame() + repeat_location_counts.rename(columns={0: "tally_repeated_location"}, inplace=True) + # Add the tally_repeated_location data as a new column + df = df.merge(repeat_location_counts, how="left", on=["latitude", "longitude"]) + + def resolve_duplicates(sdf): + if len(sdf) == 1: + # If there's only one row in the group, return it. + return sdf.iloc[0] + # Take the entry which has the fewest repetitions of the latitude and + # longitude value. We will use the version from the first dataset that + # had the fewest repetitions of the location for this image. + # We adopt this row's collection, dataset, and site values in addition + # to its coordinates. + idx = np.argmin(sdf["tally_repeated_location"]) + row = sdf.iloc[idx].copy() + # For numeric columns (other than latitude and longitude), take the + # average of the values where they are present. + for col in [ + "depth_of_observer", + "altitude", + "bathymetry", + "salinity", + "temperature", + "acidity", + "area", + ]: + select = ~pd.isna(sdf[col]) + if select.sum() == 0: + continue + row[col] = sdf[select][col].mean() + # Look to see if we are missing an image or thumbnail entry and one + # of the duplicates has its value. + for col in ["image", "url_thumbnail"]: + if not pd.isna(row[col]): + continue + values = sdf[col] + values = values[~pd.isna(values)] + if len(values) == 0: + continue + row[col] = values.iloc[0] + # For datetime, use the fact that we encoded datetime as a string + # with varying levels of precision. More digits means higher precision. + # Take the most precise value, preferring the value from the selected + # record in the event of a tie. + datetime_len = sdf["datetime"].str.replace(" 00:00:00", "").str.len() + idx_dt = np.argmax(datetime_len) + if datetime_len.iloc[idx] != datetime_len.iloc[idx_dt]: + row["datetime"] = sdf.iloc[idx_dt]["datetime"] + return row + + print("Merging metadata between rows with the same URL") + # Group by URL and apply our transformation to each group + df_out = df.groupby("url").progress_apply(resolve_duplicates) + # Reorder the dataframe to preseve implicit temporal information from the + # ordering of the images + df_out.sort_values("original_index", inplace=True) + df_out.drop(columns=["original_index", "tally_repeated_location"], inplace=True) + return df_out + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. 
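# Illustrative sketch with toy data: tqdm.pandas() (called at module import
# above) registers .progress_apply() on pandas objects, which is what lets the
# per-URL merge in merge_duplicated_urls report a progress bar while it runs.
# A minimal, self-contained example of the same pattern (the toy frame and its
# values are made up for illustration):
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()  # adds .progress_apply() to DataFrame/Series/GroupBy objects
toy = pd.DataFrame({"url": ["a", "a", "b"], "depth": [10.0, 20.0, 5.0]})
# Average the numeric metadata within each URL group, with a progress bar
merged = toy.groupby("url").progress_apply(lambda g: g.mean(numeric_only=True))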
@@ -1670,8 +1749,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): # Remove duplicate URLs if verbose >= 1: - print("Remove duplicates") - df_all.drop_duplicates(subset="url", inplace=True, keep="first") + print("Merge duplicated URLs") + df_all = merge_duplicated_urls(df_all) print(f"There are {len(df_all)} records after dropping duplicated URLs") # Fix repeated output paths by replacing with image field From f631dd85fba8e3b9413cac81a7593629266da3d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 10:23:49 +0100 Subject: [PATCH 84/92] MNT: Save a copy with duplicates before removing them, so duplicates can be resolved without merging again --- pangaea_downloader/merge_benthic_datasets.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2df21bc..d5e7336 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1747,6 +1747,13 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print(f"There are {len(df_all)} records before dropping duplicated URLs") + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + output_path_with_dups = os.path.splitext(output_path)[0] + "_with-duplicates.csv" + if verbose >= 0: + print(f"Saving (with duplicates) to {output_path_with_dups}") + df_all.to_csv(output_path_with_dups, index=False) + # Remove duplicate URLs if verbose >= 1: print("Merge duplicated URLs") @@ -1762,10 +1769,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): print("Fix repeated output paths to prevent collisions") df_all = fixup_repeated_output_paths(df_all, inplace=True, verbose=verbose) - if os.path.dirname(output_path): - os.makedirs(os.path.dirname(output_path), exist_ok=True) if verbose >= 0: - print(f"Saving to {output_path}") + print(f"Saving (without duplicates) to {output_path}") df_all.to_csv(output_path, index=False) From eebb878005d0e2ae00d8d93c5596148afc35005e Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Thu, 6 Apr 2023 10:25:35 +0100 Subject: [PATCH 85/92] BUG: Need to convert datetime to string before merging (some are datetime objects) --- pangaea_downloader/merge_benthic_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index d5e7336..2ccc433 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1757,6 +1757,8 @@ def process_datasets(input_dirname, output_path=None, verbose=0): # Remove duplicate URLs if verbose >= 1: print("Merge duplicated URLs") + # Convert datetime to string + df_all["datetime"] = df_all["datetime"].astype(str) df_all = merge_duplicated_urls(df_all) print(f"There are {len(df_all)} records after dropping duplicated URLs") From f1cf9852c9feaf686f8c2b61b04ada00efdd70b3 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:25:25 +0100 Subject: [PATCH 86/92] MNT: Rewrite any(list comp) as any(generator) instead (flake8:C419) C419 Unnecessary list comprehension passed to any() prevents short-circuiting - rewrite as a generator. 
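For example, in reformat_df the taxonomy check

    any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]])

builds the full three-element list before any() sees it, whereas the generator form

    any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"])

can return True as soon as the first rank name is found, without evaluating the remaining membership tests.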
--- pangaea_downloader/merge_benthic_datasets.py | 2 +- pangaea_downloader/tools/checker.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2ccc433..2cfd4c3 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -425,7 +425,7 @@ def reformat_df(df, remove_duplicate_columns=True): if "site" not in df.columns: df["site"] = df["dataset"] + "_site" - if any([c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]]): + if any(c in clean_cols for c in ["Kingdom", "Phylum", "Genus"]): df["taxonomy"] = df.apply(row2taxonomy, axis=1) df.drop( labels=[x for syn in TAXONOMY_RANKS for x in syn], diff --git a/pangaea_downloader/tools/checker.py b/pangaea_downloader/tools/checker.py index 7292c43..3d345d6 100644 --- a/pangaea_downloader/tools/checker.py +++ b/pangaea_downloader/tools/checker.py @@ -61,8 +61,8 @@ def is_invalid_file_ext(filename: str) -> bool: # --------------------------------------------- DataFrame Checkers --------------------------------------------- # def has_url_col(df: DataFrame) -> bool: """Take a Pandas DataFrame and return True if it has image URL column.""" - condition1 = any(["url" in col.lower() for col in df.columns]) - condition2 = any(["image" in col.lower() for col in df.columns]) + condition1 = any("url" in col.lower() for col in df.columns) + condition2 = any("image" in col.lower() for col in df.columns) return condition1 or condition2 From 4a42f493f752ca3aa75f7d13b18076b74651a0d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:26:46 +0100 Subject: [PATCH 87/92] MNT: Rename depth columns --- pangaea_downloader/merge_benthic_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 2cfd4c3..9898115 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -319,8 +319,8 @@ def reformat_df(df, remove_duplicate_columns=True): "x_pos": [], "y_pos": [], "altitude": ["altitude", "heightaboveseafloor", "height"], - "depth_of_observer": ["depthwater", "depth"], - "bathymetry": ["bathydepth", "bathymetry", "bathy"], + "depth_camera": ["depthwater", "depth"], + "depth_seafloor": ["bathydepth", "bathymetry", "bathy"], "elevation": ["elevation"], "backscatter": [], "temperature": ["temperature", "temp"], From e5d9db079226989d52bdfd0a8cf1ca64ae1e37d8 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:27:21 +0100 Subject: [PATCH 88/92] MNT: Exclude AntGlassSponges with DOWN in their URL - not Benthic imagery --- pangaea_downloader/merge_benthic_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 9898115..4641f4b 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -538,6 +538,7 @@ def check_image_url(url): if ( url.startswith("https://hs.pangaea.de/Images/Benthos/AntGlassSponges/") and "AHEAD" not in url + and "DOWN" not in url ): # Images of AntGlassSponges must contain "AHEAD" to be kept # otherwise, they are of sponges after removal From 3dfaba1c1fa74cb0960d3c40b20534a8e729b128 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:27:52 +0100 Subject: [PATCH 89/92] MNT: Skip missing URL cols --- 
pangaea_downloader/merge_benthic_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 4641f4b..7079aaf 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1526,6 +1526,8 @@ def resolve_duplicates(sdf): "acidity", "area", ]: + if col not in sdf.columns: + continue select = ~pd.isna(sdf[col]) if select.sum() == 0: continue @@ -1533,6 +1535,8 @@ def resolve_duplicates(sdf): # Look to see if we are missing an image or thumbnail entry and one # of the duplicates has its value. for col in ["image", "url_thumbnail"]: + if col not in sdf.columns: + continue if not pd.isna(row[col]): continue values = sdf[col] From a8967dcdaef95cc991b860eef200a0eb6e2cf67c Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:28:23 +0100 Subject: [PATCH 90/92] ENH: Add process_single to cleanup metadata for a single dataset --- pangaea_downloader/merge_benthic_datasets.py | 75 ++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py index 7079aaf..a7532fa 100755 --- a/pangaea_downloader/merge_benthic_datasets.py +++ b/pangaea_downloader/merge_benthic_datasets.py @@ -1564,6 +1564,81 @@ def resolve_duplicates(sdf): return df_out +def process_single(df, ds_id=None, verbose=1, remove_duplicate_columns=False): + """ + Reformat and cleanup metadata for a single dataset. + + Parameters + ---------- + df : pandas.Dataframe + The dataset to process. + ds_id : int, optional + The ID number for the PANGAEA dataset. If omitted, it is inferred from + the ``ds_id`` column of ``df``. + verbose : int, default=1 + Verbosity level. + remove_duplicate_columns : bool, default=False + Whether to remove duplicate column names. + + Returns + ------- + df : pandas.Dataframe + A processed copy of the dataset. 
+ """ + if df is None or len(df) == 0: + return df + + if ds_id is None: + ds_id = df.iloc[0]["ds_id"] + if isinstance(ds_id, str): + ds_id = int(ds_id.split("-")[-1]) + + if "ds_id" in df.columns: + df["ds_id"] = "pangaea-" + df["ds_id"].astype(str) + df["ds_id"] = df["ds_id"].str.replace("pangaea-pangaea-", "pangaea-") + else: + df["ds_id"] = f"pangaea-{ds_id}" + if "parent_ds_id" in df.columns: + df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str) + df["parent_ds_id"] = df["parent_ds_id"].str.replace( + "pangaea-pangaea-", "pangaea-" + ) + + df = reformat_df(df, remove_duplicate_columns=remove_duplicate_columns) + if df is None: + return df + + url_col = "url" + df = df[df[url_col] != ""] + if len(df) == 0: + return df + + df = filter_urls(df, url_column=url_col) + if len(df) == 0: + return df + + # Drop rows that are complete duplicates + df.drop_duplicates(inplace=True) + + # Try to fix repeated URLs that are accidental dups but should differ + df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose) + + # Check for any rows that are all NaNs + if sum(df.isna().all("columns")) > 0: + print(f"{ds_id} has a row which is all NaNs") + + # Remove duplicated "favourited" images + df = fixup_favourite_images(df, verbose=verbose) + + # Fix incomplete lat/lon/datetime metadata + df = fixup_incomplete_metadata(df, ds_id, verbose=verbose) + + # Add datetime if it is completely missing + df = add_missing_datetime(df, ds_id, verbose=verbose) + + return df + + def process_datasets(input_dirname, output_path=None, verbose=0): """ Process a directory of datasets: clean, concatenate and save. From c4ce7aba35b6eae8c9f63e46f04b316b9d44abf2 Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:29:17 +0100 Subject: [PATCH 91/92] JNB: More EDA and new output files --- notebooks/explore-depth-columns.ipynb | 80 ++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/notebooks/explore-depth-columns.ipynb b/notebooks/explore-depth-columns.ipynb index 2f3c590..53305b9 100644 --- a/notebooks/explore-depth-columns.ipynb +++ b/notebooks/explore-depth-columns.ipynb @@ -30,13 +30,19 @@ "cell_type": "code", "execution_count": null, "id": "b6f9ebdb", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Load datasets from this directory\n", "dirname = \"../query-outputs_2022-01-01\"\n", + "dirname = \"../query-outputs_2023-03-07_extras/\"\n", + "dirname = \"../query-outputs_2023-03-30c/\"\n", + "# dirname = \"../query-outputs_2023-03-30c\"\n", "# Pangaea benthic image dataset file with filtered dataset IDs\n", "pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", + "pangaea_file = \"../datasetcsvs/pangaea_2023-03-30c_with-tiles4.csv\"\n", "pangaea_df = pd.read_csv(pangaea_file)\n", "ds_ids = pangaea_df.dataset.unique()\n", "print(f\"Total {len(ds_ids)} datasets to process.\")" @@ -186,6 +192,78 @@ " print(f\"{c:.<35s} {count:4d}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a251b7dd-673b-43c0-b948-bb83019aedb1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"sal\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42a79516-2ab2-45ee-b876-daf12758ed00", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"area\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235276c3-d887-46b6-a453-2873a636533a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ 
+ "column_examples[\"length\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "897336e0-d260-46d4-a71b-7e882e785ce5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"classification\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f159be9-f6dc-4d0f-ae6a-a781a9983cdf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"content\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d804d2f-6adb-42f3-b164-68fe42a08b92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "column_examples[\"ground vis\"]" + ] + }, { "cell_type": "markdown", "id": "a07b478a-bd3d-417f-8e88-f49ea585c812", From d8e0aef41356b7e99502c85d3fb83ddbe8cc453c Mon Sep 17 00:00:00 2001 From: Scott Lowe Date: Wed, 8 May 2024 23:45:27 +0100 Subject: [PATCH 92/92] DEV: Remove malfunctioning pretty-format-json On the GitHub workflow, it is trying to fix the format of the jupyter notebooks. --- .pre-commit-config.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19b80c0..52656ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -80,8 +80,6 @@ repos: - id: detect-private-key - id: end-of-file-fixer exclude: ^LICENSE|\.(html|csv|txt|svg|py)$ - - id: pretty-format-json - args: ["--autofix", "--no-ensure-ascii", "--no-sort-keys"] - id: requirements-txt-fixer - id: trailing-whitespace args: [--markdown-linebreak-ext=md]