Skip to content

Commit

Permalink
Update dataset-details-scraper.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
benjaminmisiuk committed Feb 15, 2024
1 parent e4f140b commit 2bda24f
Showing 1 changed file with 26 additions and 46 deletions.
72 changes: 26 additions & 46 deletions notebooks/dataset-details-scraper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@
"from tqdm.auto import tqdm\n",
"\n",
"sys.path.append(\"..\")\n",
"from pangaea_downloader.tools.checker import is_url"
"from pangaea_downloader.tools.checker import is_url\n",
"\n",
"sys.path.append(\"pangaea_downloader/tools\")\n",
"import checker"
]
},
{
Expand All @@ -73,9 +76,8 @@
"metadata": {},
"outputs": [],
"source": [
"# pangaea_file = \"../full-dataset/pangaea_2022-03-03_filtered_no-repeats_sorted-first_subsampled-1.25m-40b-200-40m_100-200m_fewfact2-nonspa-exh2.csv\"\n",
"pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n",
"# pangaea_file = \"../full-dataset/pangaea_2022-01-24.csv\"\n",
"# pangaea_file: path to a CSV listing every downloaded dataset with its URL and id; set it (e.g. uncomment the example below) before running this cell.\n",
"# pangaea_file = \"../full-dataset/pangaea_2022-01-24_filtered.csv\"\n",
"df = pd.read_csv(pangaea_file, low_memory=False)\n",
"sorted_ids = sorted([int(ds_id.split(\"-\")[-1]) for ds_id in df.dataset.unique()])\n",
"ds_ids = [f\"pangaea-{id_}\" for id_ in sorted_ids]\n",
Expand Down Expand Up @@ -195,8 +197,8 @@
" \"\"\"Given a parsed html object return a tuple with the dataset project name and URL (if available).\"\"\"\n",
" name, url = None, None\n",
" if len(ds.projects) > 0:\n",
" name = ds.projects[0].name.text if ds.projects[0].name is not None else None\n",
" href = ds.projects[0].URL.text if ds.projects[0].URL is not None else None\n",
" name = ds.projects[0].name if ds.projects[0].name is not None else None\n",
" href = ds.projects[0].URL if ds.projects[0].URL is not None else None\n",
" if isinstance(href, str) and is_url(href):\n",
" url = href\n",
" return name, url\n",
Expand Down Expand Up @@ -266,18 +268,18 @@
" \"license_url\": None,\n",
"}\n",
"# Parent dataset\n",
"if len(ds.children) > 0:\n",
" children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.children]\n",
"if len(ds.collection_members) > 0:\n",
" children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.collection_members]\n",
" info[\"is_parent\"] = True\n",
" info[\"children\"] = children\n",
"# Child: Identify parents\n",
"if \"In:\" in info[\"citation_dataset\"]:\n",
" info[\"parent\"] = f\"pangaea-{info['citation_dataset'].split('.')[-1]}\"\n",
"info[\"citation_paper\"] = get_paper_citation(soup)\n",
"if (ds.error != \"Data set is protected\") and (len(ds.licenses) > 0):\n",
" info[\"license\"] = ds.licenses[0].label.text\n",
" info[\"license_url\"] = ds.licenses[0].URI.text\n",
"elif ds.error == \"Data set is protected\":\n",
"if len(ds.licence.label) > 0:\n",
" info[\"license\"] = ds.licence.label\n",
" info[\"license_url\"] = ds.licence.URI\n",
"else:\n",
" info[\"license\"] = \"Protected (License Unknown)\"\n",
" info[\"license_url\"] = None\n",
"proj = get_project_info(ds, soup)\n",
Expand Down Expand Up @@ -336,7 +338,9 @@
"cell_type": "code",
"execution_count": null,
"id": "b74f5b63",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"bibtex_list = []\n",
Expand Down Expand Up @@ -379,8 +383,8 @@
" \"license\": None,\n",
" \"license_url\": None,\n",
" }\n",
" if len(ds.children) > 0: # Parent dataset\n",
" children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.children]\n",
" if len(ds.collection_members) > 0: # Parent dataset\n",
" children = [f\"pangaea-{child.split('.')[-1]}\" for child in ds.collection_members]\n",
" parent_child_mappings[ds_id] = children\n",
" info[\"is_parent\"] = True\n",
" info[\"children\"] = children\n",
Expand All @@ -390,10 +394,10 @@
" info[\"citation_paper\"] = get_paper_citation(soup)\n",
" if isinstance(info[\"citation_paper\"], str):\n",
" info[\"citation_paper\"] = correct_citation_paper(info[\"citation_paper\"], ds)\n",
" if (ds.error != \"Data set is protected\") and (len(ds.licenses) > 0):\n",
" info[\"license\"] = ds.licenses[0].label.text\n",
" info[\"license_url\"] = ds.licenses[0].URI.text\n",
" elif ds.error == \"Data set is protected\":\n",
" if (len(ds.licence.label) > 0):\n",
" info[\"license\"] = ds.licence.label\n",
" info[\"license_url\"] = ds.licence.URI\n",
" else:\n",
" info[\"license\"] = \"Protected (License Unknown)\"\n",
" info[\"license_url\"] = None\n",
" proj = get_project_info(ds, soup)\n",
Expand Down Expand Up @@ -928,37 +932,13 @@
" print(\"-\" * 85)\n",
" dds.append(dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9b245cd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8441957",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "509ed3fd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8 (ws)",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "ws"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -970,7 +950,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.18"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 2bda24f

Please sign in to comment.