From a8967dcdaef95cc991b860eef200a0eb6e2cf67c Mon Sep 17 00:00:00 2001
From: Scott Lowe
Date: Wed, 8 May 2024 23:28:23 +0100
Subject: [PATCH] ENH: Add process_single to clean up metadata for a single dataset

---
 pangaea_downloader/merge_benthic_datasets.py | 75 ++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/pangaea_downloader/merge_benthic_datasets.py b/pangaea_downloader/merge_benthic_datasets.py
index 7079aaf..a7532fa 100755
--- a/pangaea_downloader/merge_benthic_datasets.py
+++ b/pangaea_downloader/merge_benthic_datasets.py
@@ -1564,6 +1564,81 @@ def resolve_duplicates(sdf):
     return df_out
 
 
+def process_single(df, ds_id=None, verbose=1, remove_duplicate_columns=False):
+    """
+    Reformat and clean up metadata for a single dataset.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The dataset to process.
+    ds_id : int, optional
+        The ID number for the PANGAEA dataset. If omitted, it is inferred from
+        the ``ds_id`` column of ``df``.
+    verbose : int, default=1
+        Verbosity level.
+    remove_duplicate_columns : bool, default=False
+        Whether to remove duplicate column names.
+
+    Returns
+    -------
+    df : pandas.DataFrame
+        A processed copy of the dataset.
+    """
+    if df is None or len(df) == 0:
+        return df
+
+    if ds_id is None:
+        ds_id = df.iloc[0]["ds_id"]
+        if isinstance(ds_id, str):
+            ds_id = int(ds_id.split("-")[-1])
+
+    if "ds_id" in df.columns:
+        df["ds_id"] = "pangaea-" + df["ds_id"].astype(str)
+        df["ds_id"] = df["ds_id"].str.replace("pangaea-pangaea-", "pangaea-")
+    else:
+        df["ds_id"] = f"pangaea-{ds_id}"
+    if "parent_ds_id" in df.columns:
+        df["parent_ds_id"] = "pangaea-" + df["parent_ds_id"].astype(str)
+        df["parent_ds_id"] = df["parent_ds_id"].str.replace(
+            "pangaea-pangaea-", "pangaea-"
+        )
+
+    df = reformat_df(df, remove_duplicate_columns=remove_duplicate_columns)
+    if df is None:
+        return df
+
+    url_col = "url"
+    df = df[df[url_col] != ""]
+    if len(df) == 0:
+        return df
+
+    df = filter_urls(df, url_column=url_col)
+    if len(df) == 0:
+        return df
+
+    # Drop rows that are complete duplicates
+    df.drop_duplicates(inplace=True)
+
+    # Try to fix repeated URLs that are accidental duplicates but should differ
+    df = fixup_repeated_urls(df, url_column=url_col, verbose=verbose)
+
+    # Check for any rows that are all NaNs
+    if sum(df.isna().all("columns")) > 0:
+        print(f"{ds_id} has a row which is all NaNs")
+
+    # Remove duplicated "favourited" images
+    df = fixup_favourite_images(df, verbose=verbose)
+
+    # Fix incomplete lat/lon/datetime metadata
+    df = fixup_incomplete_metadata(df, ds_id, verbose=verbose)
+
+    # Add datetime if it is completely missing
+    df = add_missing_datetime(df, ds_id, verbose=verbose)
+
+    return df
+
+
 def process_datasets(input_dirname, output_path=None, verbose=0):
     """
     Process a directory of datasets: clean, concatenate and save.
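-- 
Usage sketch: a minimal example of how the new process_single could be called on one
downloaded dataset. The module path, function name, and signature come from the patch
above; the CSV file paths, the dataset ID, and the assumption that a downloaded dataset
is stored as a CSV containing ``ds_id`` and ``url`` columns are illustrative only.

    import pandas as pd

    from pangaea_downloader.merge_benthic_datasets import process_single

    # Load one downloaded PANGAEA dataset (hypothetical path and dataset ID)
    df = pd.read_csv("downloads/pangaea-896790.csv")

    # Clean up the metadata for this single dataset; ds_id is inferred from
    # the ds_id column of the first row when it is not passed explicitly
    df_clean = process_single(df, verbose=1)

    # process_single can return None or an empty frame if nothing usable remains
    if df_clean is not None and len(df_clean) > 0:
        df_clean.to_csv("downloads/pangaea-896790-cleaned.csv", index=False)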