diff --git a/notebooks/dataset-details-scraper.ipynb b/notebooks/dataset-details-scraper.ipynb index f29ace3..ec03fe0 100644 --- a/notebooks/dataset-details-scraper.ipynb +++ b/notebooks/dataset-details-scraper.ipynb @@ -55,7 +55,10 @@ "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n", - "from pangaea_downloader.tools.checker import is_url" + "from pangaea_downloader.tools.checker import is_url\n", + "\n", + "sys.path.append(\"pangaea_downloader/tools\")\n", + "import checker" ] }, { @@ -73,9 +76,8 @@ "metadata": {}, "outputs": [], "source": [ - "# pangaea_file = \"../full-dataset/pangaea_2022-03-03_filtered_no-repeats_sorted-first_subsampled-1.25m-40b-200-40m_100-200m_fewfact2-nonspa-exh2.csv\"\n", - "pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", - "# pangaea_file = \"../full-dataset/pangaea_2022-01-24.csv\"\n", + "# pangaea_file is a csv indicating all datasets that have been downloaded, their URLs, and ids\n", + "# pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n", "df = pd.read_csv(pangaea_file, low_memory=False)\n", "sorted_ids = sorted([int(ds_id.split(\"-\")[-1]) for ds_id in df.dataset.unique()])\n", "ds_ids = [f\"pangaea-{id_}\" for id_ in sorted_ids]\n", @@ -195,8 +197,8 @@ " \"\"\"Given a parsed html object return a tuple with the dataset project name and URL (if available).\"\"\"\n", " name, url = None, None\n", " if len(ds.projects) > 0:\n", - " name = ds.projects[0].name.text if ds.projects[0].name is not None else None\n", - " href = ds.projects[0].URL.text if ds.projects[0].URL is not None else None\n", + " name = ds.projects[0].name if ds.projects[0].name is not None else None\n", + " href = ds.projects[0].URL if ds.projects[0].URL is not None else None\n", " if isinstance(href, str) and is_url(href):\n", " url = href\n", " return name, url\n", @@ -266,18 +268,18 @@ " \"license_url\": None,\n", "}\n", "# Parent dataset\n", - "if len(ds.children) > 0:\n", - " children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.children]\n", + "if len(ds.collection_members) > 0:\n", + " children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.collection_members]\n", " info[\"is_parent\"] = True\n", " info[\"children\"] = children\n", "# Child: Identify parents\n", "if \"In:\" in info[\"citation_dataset\"]:\n", " info[\"parent\"] = f\"pangaea-{info['citation_dataset'].split('.')[-1]}\"\n", "info[\"citation_paper\"] = get_paper_citation(soup)\n", - "if (ds.error != \"Data set is protected\") and (len(ds.licenses) > 0):\n", - " info[\"license\"] = ds.licenses[0].label.text\n", - " info[\"license_url\"] = ds.licenses[0].URI.text\n", - "elif ds.error == \"Data set is protected\":\n", + "if len(ds.licence.label) > 0:\n", + " info[\"license\"] = ds.licence.label\n", + " info[\"license_url\"] = ds.licence.URI\n", + "else:\n", " info[\"license\"] = \"Protected (License Unknown)\"\n", " info[\"license_url\"] = None\n", "proj = get_project_info(ds, soup)\n", @@ -336,7 +338,9 @@ "cell_type": "code", "execution_count": null, "id": "b74f5b63", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "bibtex_list = []\n", @@ -379,8 +383,8 @@ " \"license\": None,\n", " \"license_url\": None,\n", " }\n", - " if len(ds.children) > 0: # Parent dataset\n", - " children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.children]\n", + " if len(ds.collection_members) > 0: # Parent dataset\n", + " children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.collection_members]\n", " parent_child_mappings[ds_id] = children\n", " info[\"is_parent\"] = True\n", " info[\"children\"] = children\n", @@ -390,10 +394,10 @@ " info[\"citation_paper\"] = get_paper_citation(soup)\n", " if isinstance(info[\"citation_paper\"], str):\n", " info[\"citation_paper\"] = correct_citation_paper(info[\"citation_paper\"], ds)\n", - " if (ds.error != \"Data set is protected\") and (len(ds.licenses) > 0):\n", - " info[\"license\"] = ds.licenses[0].label.text\n", - " info[\"license_url\"] = ds.licenses[0].URI.text\n", - " elif ds.error == \"Data set is protected\":\n", + " if (len(ds.licence.label) > 0):\n", + " info[\"license\"] = ds.licence.label\n", + " info[\"license_url\"] = ds.licence.URI\n", + " else:\n", " info[\"license\"] = \"Protected (License Unknown)\"\n", " info[\"license_url\"] = None\n", " proj = get_project_info(ds, soup)\n", @@ -928,37 +932,13 @@ " print(\"-\" * 85)\n", " dds.append(dataset)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9b245cd", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8441957", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "509ed3fd", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.8 (ws)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "ws" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -970,7 +950,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.8.18" } }, "nbformat": 4,