diff --git a/.gitignore b/.gitignore index 9ab6dc5..1e627b7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,20 @@ requirements-tocheck.in dev-requirements.txt .idea .env +draft-* +datasets +pandora-collections.ndjson +pandora-subjects.ndjson +pandora-titles.csv +snapshot-test.ndjson +snapshots.csv +subjects.ndjson +title_urls.ndjson +titles_all.ndjson +titles.csv +titles.db +titles.ndjson +Untitled* +.python-version +http_cache.sqlite +data-rocrate diff --git a/README.md b/README.md index 3157b81..94d6415 100644 --- a/README.md +++ b/README.md @@ -1,83 +1,23 @@ -# Trove web archives +# trove-web-archives -Current version: [v0.0.0](https://github.com/GLAM-Workbench/trove-web-archives/releases/tag/v0.0.0) +A GLAM Workbench repository -Tools and examples to woth with Pandora For more information see the [Trove web archives](https://glam-workbench.net/trove-web-archives/) section of the GLAM Workbench. +For more information and documentation see the [Trove web archive collections (Pandora)](https://glam-workbench.net/trove-web-archives) section of the [GLAM Workbench](https://glam-workbench.net). -## Notebook topics +## Notebooks +- [Create title datasets from collections and subjects](https://github.com/GLAM-Workbench/trove-web-archives/blob/master/create-datasets.ipynb) +- [Harvest Pandora subjects and collections](https://github.com/GLAM-Workbench/trove-web-archives/blob/master/harvest-pandora-subject-collections.ipynb) +- [Harvest the full collection of Pandora titles](https://github.com/GLAM-Workbench/trove-web-archives/blob/master/harvest-pandora-titles.ipynb) -* [Notebook title](sample_notebook.ipynb) – this notebook does things -See the [GLAM Workbench for more details](https://glam-workbench.net/trove-web-archives/). +## Associated datasets +- [trove-web-archives-titles](https://github.com/GLAM-Workbench/trove-web-archives-titles) +- [trove-web-archives-collections](https://github.com/GLAM-Workbench/trove-web-archives-collections) - -## Run these notebooks - -There are a number of different ways to use these notebooks. Binder is quickest and easiest, but it doesn't save your data. I've listed the options below from easiest to most complicated (requiring more technical knowledge). - -### Using Binder - -[![Launch on Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/GLAM-Workbench/trove-web-archives/master/?urlpath=lab/tree/index.ipynb) - -Click on the button above to launch the notebooks in this repository using the [Binder](https://mybinder.org/) service (it might take a little while to load). This is a free service, but note that sessions will close if you stop using the notebooks, and no data will be saved. Make sure you download any changed notebooks or harvested data that you want to save. - -See [Using Binder](https://glam-workbench.net/using-binder/) for more details. - -### Using Reclaim Cloud - -[![Launch on Reclaim Cloud](https://glam-workbench.github.io/images/launch-on-reclaim-cloud.svg)](https://app.my.reclaim.cloud/?manifest=https://raw.githubusercontent.com/GLAM-Workbench/trove-web-archives/master/reclaim-manifest.jps) - -[Reclaim Cloud](https://reclaim.cloud/) is a paid hosting service, aimed particularly at supported digital scholarship in hte humanities. Unlike Binder, the environments you create on Reclaim Cloud will save your data – even if you switch them off! To run this repository on Reclaim Cloud for the first time: - -* Create a [Reclaim Cloud](https://reclaim.cloud/) account and log in. 
-* Click on the button above to start the installation process. -* A dialogue box will ask you to set a password, this is used to limit access to your Jupyter installation. -* Sit back and wait for the installation to complete! -* Once the installation is finished click on the 'Open in Browser' button of your newly created environment (note that you might need to wait a few minutes before everything is ready). - -See [Using Reclaim Cloud](https://glam-workbench.net/using-reclaim-cloud/) for more details. - -### Using Docker - -You can use Docker to run a pre-built computing environment on your own computer. It will set up everything you need to run the notebooks in this repository. This is free, but requires more technical knowledge – you'll have to install Docker on your computer, and be able to use the command line. - -* Install [Docker Desktop](https://docs.docker.com/get-docker/). -* Create a new directory for this repository and open it from the command line. -* From the command line, run the following command: - ``` - docker run -p 8888:8888 --name trove-web-archives -v "$PWD":/home/jovyan/work quay.io/glamworkbench/trove-web-archives repo2docker-entrypoint jupyter lab --ip 0.0.0.0 --NotebookApp.token='' --LabApp.default_url='/lab/tree/index.ipynb' - ``` -* It will take a while to download and configure the Docker image. Once it's ready you'll see a message saying that Jupyter Notebook is running. -* Point your web browser to `http://127.0.0.1:8888` - -See [Using Docker](https://glam-workbench.net/using-docker/) for more details. - -### Setting up on your own computer - -If you know your way around the command line and are comfortable installing software, you might want to set up your own computer to run these notebooks. - -Assuming you have recent versions of Python and Git installed, the steps might be something like: - -* Create a virtual environment, eg: `python -m venv trove-web-archives` -* Open the new directory" `cd trove-web-archives` -* Activate the environment `source bin/activate` -* Clone the repository: `git clone https://github.com/GLAM-Workbench/trove-web-archives.git notebooks` -* Open the new `notebooks` directory: `cd notebooks` -* Install the necessary Python packages: `pip install -r requirements.txt` -* Run Jupyter: `jupyter lab` - -See [Getting started](https://glam-workbench.net/getting-started/#using-python-on-your-own-computer for more details. - -## Cite as - -See the GLAM Workbench or [Zenodo](https://doi.org/10.5281/zenodo.3521724) for up-to-date citation details. - ---- - -This repository is part of the [GLAM Workbench](https://glam-workbench.net/). - +Created by [Tim Sherratt](https://timsherratt.au) for the [GLAM Workbench](https://glam-workbench.net) \ No newline at end of file diff --git a/create-datasets.ipynb b/create-datasets.ipynb new file mode 100644 index 0000000..babf1ce --- /dev/null +++ b/create-datasets.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5241964-ff09-4c34-b164-32befd8ac430", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Create archived url datasets from Pandora's collections and subjects\n", + "\n", + "This notebook helps you create a dataset of archived urls using Pandora's subject and collection groupings.\n", + "\n", + "The Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search that would find websites relating to election campaigns? 
Fortunately you don't have to, as Pandora provides a collection of archived web resources organised by subject and collection. By using harvests of Pandora's subject hierarchy and a complete list of archived titles, this notebook makes it easy for you to create custom datasets relating to a specific topic or event.\n", + "\n", + "This notebook uses pre-harvested datasets containing information about Pandora's subjects, collections and titles. New titles are added to Pandora frequently, so you might want to create your own updated versions using these notebooks:\n", + "\n", + "- [Harvest Pandora subjects and collections](harvest-pandora-subject-collections.ipynb)\n", + "- [Harvest the full collection of Pandora titles](harvest-pandora-titles.ipynb)\n", + "\n", + "## Using this notebook\n", + "\n", + "The simplest way to get started is to browse the subject and collection groupings in [Pandora](http://pandora.nla.gov.au/). Once you've found a subject or collection of interest, just copy its identifier, either `/subject/[subject number]` or `/col/[collection number]`. You also need to decide if you want *every* title under that subject or collection, including those associated with its children, or if you only want the titles directly linked to your selected grouping.\n", + "\n", + "Then you can run either `create_subject_dataset([your subject id])` or `create_collection_dataset([your collection id])`.\n", + "\n", + "## Datasets\n", + "\n", + "This notebook creates a CSV formatted dataset containing the following fields:\n", + "\n", + "- `tep_id` – the Title Entry Page (TEP) identifier in the form `/tep/[TEP NUMBER]`\n", + "- `name` – name of the title\n", + "- `gathered_url` – the url that was archived\n", + "- `surt` – the surt (Sort-friendly URI Reordering Transform) is a version of the url that reverses the order of the domain components to put the top-level domain first, making it easier to group or sort resources by domain\n", + "\n", + "Note that Pandora's title records can bring together different urls and domains that have pointed to a resource over time. This means that there can be multiple urls associated with each TEP. See [Harvest the full collection of Pandora titles](harvest-pandora-titles.ipynb) for more information.\n", + "\n", + "The dataset also includes an RO-Crate metadata file describing the dataset's contents and context.\n", + "\n", + "## What can you do with a collection of archived urls?\n", + "\n", + "For more information about the Pandora title, use the `tep_id` to construct a url to a human-readable version in Trove, or a machine-readable JSON version:\n", + "\n", + "- [https://webarchive.nla.gov.au/tep/131444](https://webarchive.nla.gov.au/tep/131444) – goes to TEP web page\n", + "- [https://webarchive.nla.gov.au/bamboo-service/tep/131444](https://webarchive.nla.gov.au/bamboo-service/tep/131444) – returns JSON version of TEP\n", + "\n", + "Once you have an archived url you can make use of the tools in the [Web Archives](https://glam-workbench.net/web-archives/) section of the GLAM Workbench to gather more data for analysis. 
For example:\n", + "\n", + "- [Find all the archived versions of a web page using Timemaps](https://glam-workbench.net/web-archives/get-all-versions/)\n", + "- [Display changes in the text of an archived web page over time](https://glam-workbench.net/web-archives/display-changes-in-text/)\n", + "- [Harvesting collections of text from archived web pages](https://glam-workbench.net/web-archives/harvesting-text/)\n", + "- [Using screenshots to visualise change in a page over time](https://glam-workbench.net/web-archives/create-screenshots-over-time/)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "id": "9362f044-66c3-44b4-b4b4-8566603bb11d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 329, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mimetypes\n", + "import os\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "\n", + "import ipynbname\n", + "import nbformat\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "from IPython.display import HTML, display\n", + "from rocrate.rocrate import ContextEntity, ROCrate\n", + "from slugify import slugify\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "1ab9af6c-01d7-459b-8595-31325aeb3558", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dfc = pd.read_json(\n", + " \"https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson\",\n", + " lines=True,\n", + ")\n", + "dfs = pd.read_json(\n", + " \"https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson\",\n", + " lines=True,\n", + ")\n", + "dft = pd.read_csv(\n", + " \"https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv\"\n", + ")\n", + "\n", + "\n", + "def create_rocrate(subject, file_path, start_date, end_date):\n", + " \"\"\"\n", + " Create an RO-Crate metadata file describing the downloaded dataset.\n", + " \"\"\"\n", + " crate = ROCrate()\n", + " crate.add_file(file_path)\n", + " nb_path = ipynbname.path()\n", + " nb = nbformat.read(nb_path, nbformat.NO_CONVERT)\n", + " metadata = nb.metadata.rocrate\n", + " nb_url = metadata.get(\"url\", \"\")\n", + " nb_properties = {\n", + " \"@type\": [\"File\", \"SoftwareSourceCode\"],\n", + " \"name\": metadata.get(\"name\", \"\"),\n", + " \"description\": metadata.get(\"description\", \"\"),\n", + " \"encodingFormat\": \"application/x-ipynb+json\",\n", + " \"codeRepository\": metadata.get(\"codeRepository\", \"\"),\n", + " \"url\": nb_url,\n", + " }\n", + " crate.add(ContextEntity(crate, nb_url, properties=nb_properties))\n", + " action_id = f\"{nb_path.stem}_run\"\n", + " action_properties = {\n", + " \"@type\": \"CreateAction\",\n", + " \"instrument\": {\"@id\": nb_url},\n", + " \"actionStatus\": {\"@id\": \"http://schema.org/CompletedActionStatus\"},\n", + " \"name\": f\"Run of notebook: {nb_path.name}\",\n", + " \"result\": {\"@id\": f\"{file_path.name}/\"},\n", + " \"object\": [{\"@id\": o[\"url\"]} for o in metadata[\"action\"][0][\"object\"]],\n", + " \"query\": f\"{subject['id']} ({subject['name']})\",\n", + " \"startDate\": start_date,\n", + " \"endDate\": end_date,\n", + " }\n", + " encoding = mimetypes.guess_type(file_path)[0]\n", + " stats = file_path.stat()\n", + " 
size = stats.st_size\n", + " date = datetime.fromtimestamp(stats.st_mtime).strftime(\"%Y-%m-%d\")\n", + " rows = 0\n", + " with file_path.open(\"r\") as df:\n", + " for line in df:\n", + " rows += 1\n", + " crate.update_jsonld(\n", + " {\n", + " \"@id\": file_path.name,\n", + " \"dateModified\": date,\n", + " \"contentSize\": size,\n", + " \"size\": rows,\n", + " \"encodingFormat\": encoding,\n", + " }\n", + " )\n", + " crate.add(ContextEntity(crate, action_id, properties=action_properties))\n", + " crate.write(file_path.parent)\n", + " crate.write_zip(file_path.parent)" + ] + }, + { + "cell_type": "markdown", + "id": "218d3ded-e4f9-4dfd-88b0-9c62a319f26f", + "metadata": {}, + "source": [ + "## Get title urls from a Pandora subject group" + ] + }, + { + "cell_type": "code", + "execution_count": 323, + "id": "cb8497ee-fae1-4252-bdb4-5fe1ab6fb12a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_title_ids_in_collection(coll_id, include_subcollections=True):\n", + " title_ids = []\n", + " coll = dfc.loc[dfc[\"id\"] == coll_id].iloc[0]\n", + " title_ids += coll[\"titles\"]\n", + " if include_subcollections:\n", + " for scoll_id in coll[\"subcollections\"]:\n", + " scoll = dfc.loc[dfc[\"id\"] == scoll_id].iloc[0]\n", + " title_ids += scoll[\"titles\"]\n", + " return title_ids\n", + "\n", + "\n", + "def get_urls_by_subject(\n", + " subject, include_subcategories=False, include_collections=False\n", + "):\n", + " title_ids = []\n", + " title_ids += subject[\"titles\"]\n", + " if include_subcategories:\n", + " for subc_id in subject[\"subcategories\"]:\n", + " subc = dfs.loc[dfs[\"id\"] == subc_id].iloc[0]\n", + " title_ids += subc[\"titles\"]\n", + " if include_collections:\n", + " for coll_id in subc[\"collections\"]:\n", + " title_ids += get_title_ids_in_collection(coll_id)\n", + " if include_collections:\n", + " for coll_id in subject[\"collections\"]:\n", + " title_ids += get_title_ids_in_collection(coll_id)\n", + " titles = dft.loc[dft[\"tep_id\"].isin(title_ids)]\n", + " return titles\n", + "\n", + "\n", + "def create_subject_dataset(\n", + " id, include_subcategories=False, include_collections=False, include_crate=True\n", + "):\n", + " start_date = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", + " subject = dfs.loc[dfs[\"id\"] == id].iloc[0]\n", + "\n", + " df = get_urls_by_subject(\n", + " subject,\n", + " include_subcategories=include_subcategories,\n", + " include_collections=include_collections,\n", + " )\n", + "\n", + " end_date = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", + " subject_slug = slugify(f\"pandora-{id}-{subject['name']}\")\n", + " output_path = Path(\"datasets\", subject_slug)\n", + " output_path.mkdir(exist_ok=True, parents=True)\n", + " output_file = Path(output_path, f\"pandora-{subject_slug}.csv\")\n", + " df.to_csv(output_file, index=False)\n", + " if include_crate:\n", + " create_rocrate(subject, output_file, start_date, end_date)\n", + " display(\n", + " HTML(\n", + " f\"Download dataset: datasets/{subject_slug}.zip\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 324, + "id": "a9895f20-1891-4c5e-8bfe-bb3ef54f023f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "Download dataset: datasets/subject-3-business-economy.zip" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + 
"source": [ + "create_subject_dataset(\n", + " \"/subject/3\", include_subcategories=True, include_collections=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e4e5aafe-6884-457f-a249-fc3a0f40f42f", + "metadata": {}, + "source": [ + "## Get title urls from a Pandora collection" + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "id": "a1f8b140-1453-4b1f-ba7f-6bdb3e77b11f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_titles_by_collection(coll, include_subcollections=True):\n", + " title_ids = get_title_ids_in_collection(\n", + " coll[\"id\"], include_subcollections=include_subcollections\n", + " )\n", + " titles = dft.loc[dft[\"tep_id\"].isin(title_ids)]\n", + " return titles\n", + "\n", + "\n", + "def create_collection_dataset(id, include_subcollections=False, include_crate=True):\n", + " start_date = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", + " coll = dfc.loc[dfc[\"id\"] == id].iloc[0]\n", + " df = get_titles_by_collection(\n", + " coll,\n", + " include_subcollections=include_subcollections,\n", + " )\n", + " end_date = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", + " coll_slug = slugify(f\"pandora-{id}-{coll['name']}\")\n", + "\n", + " output_path = Path(\"datasets\", coll_slug)\n", + " output_path.mkdir(exist_ok=True, parents=True)\n", + " output_file = Path(output_path, f\"pandora-{coll_slug}.csv\")\n", + " df.to_csv(output_file, index=False)\n", + " if include_crate:\n", + " create_rocrate(coll, output_file, start_date, end_date)\n", + " display(\n", + " HTML(\n", + " f\"Download dataset: datasets/{coll_slug}.zip\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "id": "f9dff1c6-4305-4d87-976e-a11f72a914e2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "Download dataset: datasets/pandora-col-21326-museums.zip" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create_collection_dataset(\"/col/21326\", include_subcollections=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea3b5a17-9b47-4c48-a471-7b42c5948371", + "metadata": { + "editable": true, + "jupyter": { + "source_hidden": true + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# IGNORE CELL -- TESTING ONLY\n", + "\n", + "if os.getenv(\"GW_STATUS\") == \"dev\":\n", + " create_subject_dataset(\n", + " \"/subject/3\",\n", + " include_subcategories=True,\n", + " include_collections=True,\n", + " include_crate=False,\n", + " )\n", + " create_collection_dataset(\n", + " \"/col/21326\", include_subcollections=True, include_crate=False\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "f4ecfc37-359a-4021-b157-4b9fbe1f0dea", + "metadata": {}, + "source": [ + "----\n", + "\n", + "Created by [Tim Sherratt](https://timsherratt.au/) for the [GLAM Workbench](https://glam-workbench.net/)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "rocrate": { + "action": [ + { + "object": [ + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv" + }, + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson" + }, + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson" + } + ] + } + ], + "author": [ + { + "mainEntityOfPage": "https://timsherratt.au", + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "description": "This notebook helps you create a dataset of archived urls using Pandora's subject and collection groupings.\n\nThe Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search that would find websites relating to election campaigns? Fortunately you don't have to, as Pandora provides a collection of archived web resources organised by subject and collection. By using harvests of Pandora's subject hierarchy and a complete list of archived titles, this notebook makes it easy for you to create custom datasets relating to a specific topic or event.", + "mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/create-datasets/", + "name": "Create title datasets from collections and subjects", + "url": "https://github.com/GLAM-Workbench/trove-web-archives/raw/master/create-datasets.ipynb" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dev-requirements.in b/dev-requirements.in index b643718..7d1a7f2 100644 --- a/dev-requirements.in +++ b/dev-requirements.in @@ -12,3 +12,4 @@ giturlparse requests python-dotenv jupyterlab-code-formatter +gitpython diff --git a/harvest-pandora-subject-collections.ipynb b/harvest-pandora-subject-collections.ipynb new file mode 100644 index 0000000..3ee5751 --- /dev/null +++ b/harvest-pandora-subject-collections.ipynb @@ -0,0 +1,454 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1a86913b-9a41-4013-b700-1696982517d7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Harvest Pandora subjects and collections\n", + "\n", + "The [Pandora](http://pandora.nla.gov.au/) selective web archive assigns archived titles to subject and collection groupings. These curated collections help researchers find archived websites relating to specific topics or events, such as [election campaigns](http://pandora.nla.gov.au/subject/6). This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles. The datasets created can be used to assemble subject-based collections of archived websites for research.\n", + "\n", + "## Pandora vs Trove\n", + "\n", + "The relationship between Pandora and Trove is a bit confusing. 
While the websites archived in Pandora are now part of the Australian Web Archive, and are searchable through Trove, not all of Pandora's metadata can be accessed through the Trove web interface.\n", + "\n", + "Trove's [Categories](https://trove.nla.gov.au/landing/categories) tab includes a link to [Archived Webpage Collections](https://webarchive.nla.gov.au/collection). This collection hierarchy is basically the same as Pandora's – combining Pandora's subjects, subcategories, and collections into a single structure. However, it only includes links to titles that are part of collections. This is important, as less than half of Pandora's selected titles seem to be assigned to collections.\n", + "\n", + "I originally started harvesting the collections from Trove, but eventually realised that I was missing out on titles that had been grouped by subject, but were not part of collections. As a result, I shifted approaches to scrape the data from Pandora directly.\n", + "\n", + "## Subjects, Collections, and Titles\n", + "\n", + "There are two levels of subject headings in Pandora. The top-level headings are displayed on the Pandora home page, for example, [Arts](http://pandora.nla.gov.au/subject/2) and [Politics](http://pandora.nla.gov.au/subject/21). The top-level headings can include sub-categories. For example, Arts includes sub-categories for Architecture and Dance. Both the top-level subjects and sub-categories can include collections and titles.\n", + "\n", + "Collections are more fine-grained groupings of titles, often related to specific events or activities. Collections can include sub-collections. In Pandora's web interface, the sub-collections are displayed as sub-headings on the collection page, but in the backend each sub-collection has its own identifier. For example, the Galleries collection includes a list of gallery websites divided into sub-collections by the state in which they're located. Both collections and sub-collections can contain titles.\n", + "\n", + "Collections can appear in multiple subjects and sub-categories. This means that the harvesting process saves duplicate copies of collections that need to be removed.\n", + "\n", + "Titles are also a type of group, bringing together webpage snapshots over time. They can also link urls where the addresses or domains of resources have changed. As a result, each title can be associated with multiple urls. This notebook doesn't harvest the full title details; it simply links title identifiers with subjects and collections. See [Harvest the full collection of Pandora titles](harvest-pandora-titles.ipynb) for more.\n", + "\n", + "Titles can be linked to any level in this hierarchy. So to assemble a complete list of titles under a subject such as 'Arts', you need to get all the titles from 'Arts', all of the titles from all of the sub-categories under 'Arts', and all of the titles from all of the collections and sub-collections under both 'Arts' and its subcategories. 
See [Create archived url datasets from Pandora's collections and subjects](create-datasets.ipynb) for an example of this.\n", + "\n", + "For more on Pandora's approach to describing collections see [Creating Structure in Web Archives With Collections: Different Concepts From Web Archivists](https://doi.org/10.48550/arXiv.2209.08649).\n", + "\n", + "## Datasets\n", + "\n", + "This notebook creates two datasets:\n", + "\n", + "- `pandora-subjects.ndjson`\n", + "- `pandora-collections.ndjson`\n", + "\n", + "Pre-harvested versions of these datasets are available from the [trove-web-archives-collections](https://github.com/GLAM-Workbench/trove-web-archives-collections) repository.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dabdf361-6f0a-4c9c-91bd-14a92b45e6a9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import re\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "from dotenv import load_dotenv\n", + "from tqdm.auto import tqdm\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d6f6094-ce03-48d7-9ef2-7cdac14dba11", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_title_ids(page_id):\n", + " \"\"\"\n", + " Get the TEP identifiers for all the titles on the specified page.\n", + " Excludes titles in subcollections as they can be harvested separately.\n", + " \"\"\"\n", + " title_ids = []\n", + " page = 1\n", + " # Subjects can have multiple pages of titles, so we'll go through page by page\n", + " # until there's no more titles\n", + " while page:\n", + " response = requests.get(f\"http://pandora.nla.gov.au{page_id}/{page}\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " # we only want the first itemlist containing titles\n", + " # subsequent titles will be part of subcollections\n", + " title_links = []\n", + " for item_list in soup.find_all(\"div\", class_=\"itemlist\"):\n", + " # This checks if the title list has an h1 tag before it\n", + " # which indicates it's actually a subcollection\n", + " if not (\n", + " item_list.find_previous_sibling(\"h1\")\n", + " and item_list.find_previous_sibling(\"h1\").name == \"h1\"\n", + " ):\n", + " # Extract the TEP ids from the links\n", + " title_links = item_list.find_all(\"a\", href=re.compile(r\"/tep/\\d+\"))\n", + " for title_link in title_links:\n", + " title_ids.append(title_link[\"href\"])\n", + " # Continue if it's a subject page and there were title links on this page\n", + " if title_links and \"/col/\" not in page_id:\n", + " page += 1\n", + " else:\n", + " page = None\n", + " time.sleep(0.5)\n", + " return title_ids\n", + "\n", + "\n", + "def harvest_subcategories(subject_id):\n", + " \"\"\"\n", + " Harvest details of sub-categories from a subject page.\n", + " \"\"\"\n", + " subject_ids = []\n", + " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " subject_links = soup.find_all(\"a\", href=re.compile(r\"/subject/\\d+$\"))\n", + "\n", + " for subject_link in subject_links:\n", + " subject_name = \" \".join(subject_link.stripped_strings)\n", + " subject_id = subject_link[\"href\"]\n", + " collection_ids = harvest_collections(subject_id)\n", + " 
title_ids = get_title_ids(subject_id)\n", + " with Path(subjects_output).open(\"a\") as subjects_file:\n", + " subjects_file.write(\n", + " json.dumps(\n", + " {\n", + " \"name\": subject_name,\n", + " \"id\": subject_id,\n", + " \"collections\": collection_ids,\n", + " \"titles\": title_ids,\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + " subject_ids.append(subject_id)\n", + " return subject_ids\n", + "\n", + "\n", + "def harvest_subcollections(coll_id, coll_name):\n", + " \"\"\"\n", + " Harvest sub-collections from a collection page.\n", + " \"\"\"\n", + " collection_ids = []\n", + " response = requests.get(f\"http://pandora.nla.gov.au{coll_id}\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " for subc in soup.find_all(\"h1\"):\n", + " sub_link = subc.find(\"a\", {\"name\": re.compile(r\"\\d+\")})\n", + " if sub_link:\n", + " sub_name = sub_link.string\n", + " if coll_name not in sub_name:\n", + " sub_name = f\"{coll_name} - {sub_name}\"\n", + " sub_id = f\"/col/{sub_link['name']}\"\n", + " title_ids = get_title_ids(sub_id)\n", + " with Path(collections_output).open(\"a\") as collections_file:\n", + " collections_file.write(\n", + " json.dumps(\n", + " {\n", + " \"name\": sub_name,\n", + " \"id\": sub_id,\n", + " \"titles\": title_ids,\n", + " \"subcollections\": [],\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + " collection_ids.append(sub_id)\n", + " return collection_ids\n", + "\n", + "\n", + "def harvest_collections(subject_id):\n", + " \"\"\"\n", + " Harvest details of collections from a subject, or sub-category page.\n", + " \"\"\"\n", + " collection_ids = []\n", + " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " collection_links = soup.find_all(\"a\", href=re.compile(r\"/col/\\d+$\"))\n", + " for coll_link in collection_links:\n", + " coll_name = \" \".join(coll_link.stripped_strings)\n", + " coll_id = coll_link[\"href\"]\n", + " subcollection_ids = harvest_subcollections(coll_id, coll_name)\n", + " title_ids = get_title_ids(coll_id)\n", + " with Path(collections_output).open(\"a\") as collections_file:\n", + " collections_file.write(\n", + " json.dumps(\n", + " {\n", + " \"name\": coll_name,\n", + " \"id\": coll_id,\n", + " \"subcollections\": subcollection_ids,\n", + " \"titles\": title_ids,\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + " collection_ids.append(coll_id)\n", + " return collection_ids\n", + "\n", + "\n", + "def harvest_subjects(sample=None):\n", + " Path(subjects_output).unlink(missing_ok=True)\n", + " Path(collections_output).unlink(missing_ok=True)\n", + " response = requests.get(\"http://pandora.nla.gov.au/\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " subject_list = soup.find(\"div\", class_=\"browseSubjects\").find_all(\"li\")\n", + " for subject in tqdm(subject_list[:sample]):\n", + " subject_link = subject.find(\"a\")\n", + " subject_name = \" \".join(subject_link.stripped_strings)\n", + " subject_id = subject_link[\"href\"]\n", + " subcategory_ids = harvest_subcategories(subject_id)\n", + " subcollection_ids = harvest_collections(subject_id)\n", + " title_ids = get_title_ids(subject_id)\n", + " with Path(subjects_output).open(\"a\") as subjects_file:\n", + " subjects_file.write(\n", + " json.dumps(\n", + " {\n", + " \"name\": subject_name,\n", + " \"id\": subject_id,\n", + " \"subcategories\": subcategory_ids,\n", + " \"collections\": subcollection_ids,\n", + " \"titles\": title_ids,\n", + " }\n", + " )\n", + " + 
\"\\n\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07258429-8f1e-4acd-a778-ce2beaa884d2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "subjects_output = \"pandora-subjects.ndjson\"\n", + "collections_output = \"pandora-collections.ndjson\"\n", + "\n", + "harvest_subjects()" + ] + }, + { + "cell_type": "markdown", + "id": "bdd2f2fe-08a1-41c7-827a-968620b1635f", + "metadata": {}, + "source": [ + "## Remove duplicate collections\n", + "\n", + "Collections can appear under multiple subjects, so there will be duplicates in the collections dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "983b6be0-3e05-49e6-8793-e824284a1344", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dfc = pd.read_json(\"pandora-collections.ndjson\", lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57cbbb9f-1feb-4026-97e3-76fd92eb246f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31e50849-089a-4788-b047-546cef627e0c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dfc.drop_duplicates(subset=[\"id\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4692b6c-7be9-442c-a143-d9f807ccde20", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37499022-7212-48e5-9c40-e4d8ca4ab21e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dfc.to_json(\"pandora-collections.ndjson\", orient=\"records\", lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "150d9e88-3e3d-43ed-82a9-75cb778f677d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# IGNORE CELL --TESTING ONLY\n", + "if os.getenv(\"GW_STATUS\") == \"dev\":\n", + " subjects_output = \"pandora-subjects-test.ndjson\"\n", + " collections_output = \"pandora-collections-test.ndjson\"\n", + "\n", + " harvest_subjects(sample=1)\n", + "\n", + " Path(subjects_output).unlink(missing_ok=True)\n", + " Path(collections_output).unlink(missing_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "f3e531ff-31cb-49ef-b9a7-06bad671cede", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "----\n", + "\n", + "Created by [Tim Sherratt](https://timsherratt.au/) for the [GLAM Workbench](https://glam-workbench.net/)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "rocrate": { + "action": [ + { + "description": "This dataset contains details of the subject and collection groupings used by Pandora to organise archived web resource titles.", + "isPartOf": "https://github.com/GLAM-Workbench/trove-web-archives-collections", + "mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/pandora-collections-data", + "result": [ + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson" + }, + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson" + } + ] + } + ], + "author": [ + { + "mainEntityOfPage": "https://timsherratt.au", + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "description": "This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles. The datasets created can be used to assemble subject-based collections of archived websites for research.", + "mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/", + "name": "Harvest Pandora subjects and collections" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/harvest-pandora-titles.ipynb b/harvest-pandora-titles.ipynb new file mode 100644 index 0000000..b1ffe68 --- /dev/null +++ b/harvest-pandora-titles.ipynb @@ -0,0 +1,386 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6464b6f6-e5d9-45a6-92fb-0b6a7216040c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Harvest the full collection of Pandora titles\n", + "\n", + "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.\n", + "\n", + "Pandora has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.\n", + "\n", + "By combining the list of titles with data [harvested from Pandora's hierarchy of subjects and collections](harvest-pandora-subject-collections.ipynb), you can [create datasets of archived urls relating to specific topics](create-datasets.ipynb).\n", + "\n", + "## What are titles?\n", + "\n", + "Pandora's 'titles' are not single resources, they're groups of resources. Titles link to snapshots of a web resource captured on different dates (also known as [Mementos](https://glam-workbench.net/web-archives/timegates-timemaps-mementos/)). Titles also bring together different urls or domains that have pointed to the resource over time. This means that each title can be linked to multiple urls. 
This notebook unpacks the title records to create an entry for each archived url.\n", + "\n", + "## Harvesting method\n", + "\n", + "There are two main processes used to harvest the data:\n", + "\n", + "- scraping Pandora's [complete list of titles](http://pandora.nla.gov.au/alpha/ALL) to save the link and name for each title\n", + "- requesting a machine-readable version of the Title Entry Page (TEP) for each title and saving all the archived urls grouped within the title\n", + "\n", + "The title links have the form `/tep/[TEP number]` and lead to a human-readable Title Entry Page in Trove. However, by changing the url, you can get a JSON version of the TEP. For example:\n", + "\n", + "- [https://webarchive.nla.gov.au/tep/131444](https://webarchive.nla.gov.au/tep/131444) – goes to TEP web page\n", + "- [https://webarchive.nla.gov.au/bamboo-service/tep/131444](https://webarchive.nla.gov.au/bamboo-service/tep/131444) – returns JSON version of TEP\n", + "\n", + "The JSON data includes a list of instances that point to individual snapshots (or Mementos) of the title. As far as I can tell, the TEPs only include snapshots captured through Pandora's selective archiving processes. Additional snapshots of a resource might have been captured by a domain crawl and included in the Australian Web Archive. A complete list of captures can be retrieved by using the url of the archived resource to [request a Timemap](https://glam-workbench.net/web-archives/get-all-versions/).\n", + "\n", + "The harvesting process attempts to extract all the archived urls from the `gatheredUrl` field in the instance data. However, it seems that when Pandora snapshots are migrated to the AWA, the `gatheredUrl` value is set to point to the snapshot, rather than the url of the original resource. The original url is embedded in the snapshot url, so the harvesting process extracts it using regular expressions.\n", + "\n", + "The urls extracted from each title record are de-duplicated, and each unique value is saved as a separate row in the resulting dataset. This means there can be multiple records for each title.\n", + "\n", + "## Dataset structure\n", + "\n", + "The dataset includes a row for each unique url from each title. The fields are:\n", + "\n", + "- `tep_id` – the TEP identifier in the form `/tep/[TEP NUMBER]`\n", + "- `name` – name of the title\n", + "- `gathered_url` – the url that was archived\n", + "- `surt` – the surt (Sort-friendly URI Reordering Transform) is a version of the url that reverses the order of the domain components to put the top-level domain first, making it easier to group or sort resources by domain\n", + "\n", + "A pre-harvested version of this dataset is available from the [trove-web-archives-titles](https://github.com/GLAM-Workbench/trove-web-archives-titles) repository." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6ae9f72-18bb-42d9-9f58-0bfdd52a2a45", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import re\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "import requests\n", + "import requests_cache\n", + "from bs4 import BeautifulSoup\n", + "from dotenv import load_dotenv\n", + "from requests.adapters import HTTPAdapter\n", + "from requests.packages.urllib3.util.retry import Retry\n", + "from surt import surt\n", + "from tqdm.auto import tqdm\n", + "\n", + "s = requests_cache.CachedSession(\"titles.db\")\n", + "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n", + "s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n", + "s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4afc1306-f255-436e-bb39-c75ae36b7f23", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def harvest_titles(output=\"titles_all.ndjson\", sample_only=False):\n", + " \"\"\"\n", + " Scrapes details of all titles from the Pandora website.\n", + " \"\"\"\n", + " Path(output).unlink(missing_ok=True)\n", + " page = 1\n", + " with tqdm() as pbar:\n", + " # Continue harvesting page by page until there's no results\n", + " while page:\n", + " # Request a page of title links\n", + " response = requests.get(f\"http://pandora.nla.gov.au/alpha/ALL/{page}\")\n", + " soup = BeautifulSoup(response.text, \"lxml\")\n", + " title_links = []\n", + " with Path(output).open(\"a\") as titles_file:\n", + " # Find all the item lists on the page and loop through them\n", + " for item_list in soup.find_all(\"div\", class_=\"itemlist\"):\n", + " # Get all the tep links\n", + " title_links = item_list.find_all(\"a\", href=re.compile(r\"/tep/\\d+\"))\n", + " # Save the tep id and name\n", + " for title_link in title_links:\n", + " titles_file.write(\n", + " json.dumps(\n", + " {\n", + " \"tep_id\": title_link[\"href\"],\n", + " \"name\": title_link.string,\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + " pbar.update(1)\n", + " # If there's title links on this page, increment the page value and continue\n", + " if title_links and not sample_only:\n", + " page += 1\n", + " # If there's no title links then stop harvesting\n", + " else:\n", + " page = None\n", + " time.sleep(0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a17f4a-bef7-4536-b83f-bd00e0ea4a54", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "harvest_titles()" + ] + }, + { + "cell_type": "markdown", + "id": "24ef2801-effb-4a92-b495-13f7973ae9a7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Extract archived urls from TEP\n", + "\n", + "Now we'll request data for each TEP and extract the archived urls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bfb39eb-8167-43ad-a993-290811913508", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def clean_url(url):\n", + " \"\"\"\n", + " Get the harvested url from a Pandora snapshot link.\n", + " \"\"\"\n", + " match = re.search(r\"^/?[A-Z0-9]*/?[A-Za-z0-9-]+/\", url)\n", + " if match:\n", + " url = url[match.end() :]\n", + " if not url.startswith(\"http\"):\n", + " url = f\"http://{url}\"\n", + " return url\n", + "\n", + "\n", + "def add_title_urls(input=\"titles_all.ndjson\", output=\"title_urls.ndjson\"):\n", + " with Path(input).open(\"r\") as input_file:\n", + " with Path(output).open(\"w\") as output_file:\n", + " for line in tqdm(input_file):\n", + " tep_data = json.loads(line)\n", + " # Get TEP JSON\n", + " url = (\n", + " f\"https://webarchive.nla.gov.au/bamboo-service{tep_data['tep_id']}\"\n", + " )\n", + " response = s.get(url)\n", + " # Some TEPs produce 500 errors -- seems they're no longer in the archive?\n", + " if response.ok:\n", + " data = response.json()\n", + " instance_urls = []\n", + " # Title record includes multiple instances\n", + " # An instance can be a different url, or a Pandora snapshot\n", + " # We want to get all the distinct urls, so we'll trim the Pandora bits from urls and\n", + " # use surts to merge http, https, www addresses\n", + " surts = []\n", + " for instance in data[\"instances\"]:\n", + " # First we'll use the `gatheredUrl` field\n", + " if gathered_url := instance.get(\"gatheredUrl\"):\n", + " # Remove the Pandora part of the url (if there is one)\n", + " gathered_url = clean_url(gathered_url)\n", + " try:\n", + " tep_surt = surt(gathered_url)\n", + " # This is to handle a broken url\n", + " except ValueError:\n", + " gathered_url = gathered_url.replace(\n", + " \"http://https:\", \"http://\"\n", + " )\n", + " tep_surt = surt(gathered_url)\n", + " # If there's no `gatheredUrl`, we'll use the `url`\n", + " elif tep_url := instance.get(\"url\"):\n", + " # Remove Pandora part of link\n", + " gathered_url = re.search(\n", + " r\"http://pandora.nla.gov.au/pan/\\w+/\\w+-\\w+/(.*)\",\n", + " tep_url,\n", + " ).group(1)\n", + " if not gathered_url.startswith(\"http\"):\n", + " gathered_url = f\"http://{gathered_url}\"\n", + " tep_surt = surt(gathered_url)\n", + " else:\n", + " tep_surt = None\n", + " # Add url to list if we don't already have it (check surts)\n", + " if tep_surt and tep_surt not in surts:\n", + " instance_urls.append(gathered_url)\n", + " surts.append(tep_surt)\n", + " # Save each url\n", + " for instance_url in sorted(set(instance_urls)):\n", + " tep_data[\"gathered_url\"] = instance_url\n", + " tep_data[\"surt\"] = surt(instance_url)\n", + " output_file.write(json.dumps(tep_data) + \"\\n\")\n", + " if not response.from_cache:\n", + " time.sleep(0.5)\n", + " else:\n", + " output_file.write(json.dumps(tep_data) + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc1b4a3b-8bfd-41d9-a044-64f42eb04c3e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "add_title_urls()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a934fe-14e9-43cb-b270-e321d647db51", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "nbval-skip" + ] + }, + "outputs": [], + "source": [ + "dft = pd.read_json(\"title_urls.ndjson\", 
lines=True)\n", + "dft.to_csv(\"pandora-titles.csv\", index=False, encoding=\"utf-8-sig\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91cbc3e5-c8e9-427c-937e-9f1fff829c39", + "metadata": { + "editable": true, + "jupyter": { + "source_hidden": true + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# IGNORE THIS CELL -- TESTING ONLY\n", + "if os.getenv(\"GW_STATUS\") == \"dev\":\n", + " harvest_titles(output=\"test.ndjson\", sample_only=True)\n", + " add_title_urls(input=\"test.ndjson\", output=\"test_urls.ndjson\")\n", + " Path(\"test.ndjson\").unlink()\n", + " Path(\"test_urls.ndjson\").unlink()" + ] + }, + { + "cell_type": "markdown", + "id": "75e4e155-3ec9-446b-ac11-d95f476cc23f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "----\n", + "\n", + "Created by [Tim Sherratt](https://timsherratt.au/) for the [GLAM Workbench](https://glam-workbench.net/)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "rocrate": { + "action": [ + { + "description": "This dataset contains a complete list of Pandora's archived web resource titles.", + "isPartOf": "https://github.com/GLAM-Workbench/trove-web-archives-titles", + "mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/pandora-titles-data", + "result": [ + { + "url": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv" + } + ] + } + ], + "author": [ + { + "mainEntityOfPage": "https://timsherratt.au", + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.", + "mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/", + "name": "Harvest the full collection of Pandora titles" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 9af391c..7594be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.nbqa.addopts] flake8 = [ - "--ignore=E501,W503" + "--ignore=E501,W503,E203" ] [tool.pytest.ini_options] addopts = "--ignore-glob=Untitled* --ignore=snippets.ipynb --ignore-glob=draft*" diff --git a/requirements.in b/requirements.in index af5af4f..e196f4b 100644 --- a/requirements.in +++ b/requirements.in @@ -5,3 +5,14 @@ pandas requests altair jupyter-archive +requests-cache +surt +# because of virtualenv +filelock==3.13.1 +ipywidgets +beautifulsoup4 +lxml +tqdm +rocrate +ipynbname +python-slugify \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3fb69ca --- /dev/null +++ b/requirements.txt @@ -0,0 +1,450 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +aiohttp==3.9.5 + # via tuspy +aiosignal==1.3.1 + # via aiohttp +altair==5.2.0 + # via -r requirements.in +anyio==4.3.0 + # via + # httpx + # jupyter-server +arcp==0.2.1 + # via rocrate +argon2-cffi==23.1.0 + # 
via jupyter-server +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +arrow==1.3.0 + # via isoduration +asttokens==2.4.1 + # via stack-data +async-lru==2.0.4 + # via jupyterlab +async-timeout==4.0.3 + # via aiohttp +attrs==23.2.0 + # via + # aiohttp + # cattrs + # jsonschema + # referencing + # requests-cache +babel==2.14.0 + # via jupyterlab-server +beautifulsoup4==4.12.3 + # via + # -r requirements.in + # nbconvert +bioblend==1.2.0 + # via gxformat2 +bleach==6.1.0 + # via nbconvert +cachecontrol[filecache]==0.14.0 + # via schema-salad +cattrs==23.2.3 + # via requests-cache +certifi==2024.2.2 + # via + # httpcore + # httpx + # requests +cffi==1.16.0 + # via argon2-cffi-bindings +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via rocrate +comm==0.2.1 + # via + # ipykernel + # ipywidgets +debugpy==1.8.1 + # via ipykernel +decorator==5.1.1 + # via ipython +defusedxml==0.7.1 + # via nbconvert +exceptiongroup==1.2.0 + # via + # anyio + # cattrs + # ipython +executing==2.0.1 + # via stack-data +fastjsonschema==2.19.1 + # via nbformat +filelock==3.13.1 + # via + # -r requirements.in + # cachecontrol + # tldextract +fqdn==1.5.1 + # via jsonschema +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +galaxy2cwl==0.1.4 + # via rocrate +gxformat2==0.18.0 + # via galaxy2cwl +h11==0.14.0 + # via httpcore +httpcore==1.0.4 + # via httpx +httpx==0.27.0 + # via jupyterlab +idna==3.6 + # via + # anyio + # httpx + # jsonschema + # requests + # tldextract + # yarl +ipykernel==6.29.3 + # via + # ipynbname + # jupyterlab +ipynbname==2023.2.0.0 + # via -r requirements.in +ipython==8.22.1 + # via + # ipykernel + # ipywidgets +ipywidgets==8.1.2 + # via -r requirements.in +isodate==0.6.1 + # via rdflib +isoduration==20.11.0 + # via jsonschema +jedi==0.19.1 + # via ipython +jinja2==3.1.3 + # via + # altair + # jupyter-server + # jupyterlab + # jupyterlab-server + # nbconvert + # rocrate +json5==0.9.18 + # via jupyterlab-server +jsonpointer==2.4 + # via jsonschema +jsonschema[format-nongpl]==4.21.1 + # via + # altair + # jupyter-events + # jupyterlab-server + # nbformat +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyter-archive==3.4.0 + # via -r requirements.in +jupyter-client==8.6.0 + # via + # ipykernel + # jupyter-server + # nbclient + # voila +jupyter-core==5.7.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # nbclient + # nbconvert + # nbformat + # voila +jupyter-events==0.9.0 + # via jupyter-server +jupyter-lsp==2.2.3 + # via jupyterlab +jupyter-server==2.12.5 + # via + # jupyter-archive + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # notebook-shim + # voila +jupyter-server-terminals==0.5.2 + # via jupyter-server +jupyterlab==4.1.2 + # via -r requirements.in +jupyterlab-pygments==0.3.0 + # via nbconvert +jupyterlab-server==2.25.3 + # via + # jupyterlab + # voila +jupyterlab-widgets==3.0.10 + # via ipywidgets +lxml==5.2.1 + # via -r requirements.in +markupsafe==2.1.5 + # via + # jinja2 + # nbconvert +matplotlib-inline==0.1.6 + # via + # ipykernel + # ipython +mistune==3.0.2 + # via + # nbconvert + # schema-salad +msgpack==1.0.8 + # via cachecontrol +multidict==6.0.5 + # via + # aiohttp + # yarl +mypy-extensions==1.0.0 + # via schema-salad +nbclient==0.7.4 + # via + # nbconvert + # voila +nbconvert==7.16.1 + # via + # jupyter-server + # voila +nbformat==5.9.2 + # via + # jupyter-server + # nbclient + # nbconvert +nest-asyncio==1.6.0 + # via ipykernel +notebook-shim==0.2.4 + # via jupyterlab +numpy==1.26.4 + # via + # altair + # pandas 
+overrides==7.7.0 + # via jupyter-server +packaging==23.2 + # via + # altair + # ipykernel + # jupyter-server + # jupyterlab + # jupyterlab-server + # nbconvert +pandas==2.2.1 + # via + # -r requirements.in + # altair +pandocfilters==1.5.1 + # via nbconvert +parso==0.8.3 + # via jedi +pexpect==4.9.0 + # via ipython +platformdirs==4.2.0 + # via + # jupyter-core + # requests-cache +prometheus-client==0.20.0 + # via jupyter-server +prompt-toolkit==3.0.43 + # via ipython +psutil==5.9.8 + # via ipykernel +ptyprocess==0.7.0 + # via + # pexpect + # terminado +pure-eval==0.2.2 + # via stack-data +pycparser==2.21 + # via cffi +pygments==2.17.2 + # via + # ipython + # nbconvert +pyparsing==3.1.2 + # via rdflib +python-dateutil==2.9.0.post0 + # via + # arrow + # jupyter-client + # pandas + # rocrate +python-json-logger==2.0.7 + # via jupyter-events +python-slugify==8.0.4 + # via -r requirements.in +pytz==2024.1 + # via pandas +pyyaml==6.0.1 + # via + # galaxy2cwl + # gxformat2 + # jupyter-events +pyzmq==25.1.2 + # via + # ipykernel + # jupyter-client + # jupyter-server +rdflib==7.0.0 + # via schema-salad +referencing==0.33.0 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +requests==2.31.0 + # via + # -r requirements.in + # bioblend + # cachecontrol + # jupyterlab-server + # requests-cache + # requests-file + # requests-toolbelt + # rocrate + # schema-salad + # tldextract + # tuspy +requests-cache==1.2.0 + # via -r requirements.in +requests-file==2.0.0 + # via tldextract +requests-toolbelt==1.0.0 + # via bioblend +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rocrate==0.10.0 + # via -r requirements.in +rpds-py==0.18.0 + # via + # jsonschema + # referencing +ruamel-yaml==0.18.6 + # via schema-salad +ruamel-yaml-clib==0.2.8 + # via ruamel-yaml +schema-salad==8.5.20240410123758 + # via gxformat2 +send2trash==1.8.2 + # via jupyter-server +six==1.16.0 + # via + # asttokens + # bleach + # isodate + # python-dateutil + # rfc3339-validator + # surt + # url-normalize +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.5 + # via beautifulsoup4 +stack-data==0.6.3 + # via ipython +surt==0.3.1 + # via -r requirements.in +terminado==0.18.0 + # via + # jupyter-server + # jupyter-server-terminals +text-unidecode==1.3 + # via python-slugify +tinycss2==1.2.1 + # via nbconvert +tinydb==4.8.0 + # via tuspy +tldextract==5.1.2 + # via surt +tomli==2.0.1 + # via jupyterlab +toolz==0.12.1 + # via altair +tornado==6.4 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # terminado +tqdm==4.66.2 + # via -r requirements.in +traitlets==5.14.1 + # via + # comm + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # jupyter-events + # jupyter-server + # jupyterlab + # matplotlib-inline + # nbclient + # nbconvert + # nbformat + # voila +tuspy==1.0.3 + # via bioblend +types-python-dateutil==2.8.19.20240106 + # via arrow +typing-extensions==4.10.0 + # via + # altair + # anyio + # async-lru + # bioblend + # cattrs +tzdata==2024.1 + # via pandas +uri-template==1.3.0 + # via jsonschema +url-normalize==1.4.3 + # via requests-cache +urllib3==2.2.1 + # via + # requests + # requests-cache +voila==0.5.5 + # via + # -r requirements.in + # voila-material +voila-material @ git+https://github.com/GLAM-Workbench/voila-material.git + # via -r requirements.in +wcwidth==0.2.13 + # via prompt-toolkit +webcolors==1.13 + # via jsonschema +webencodings==0.5.1 + # via + # bleach + # 
tinycss2 +websocket-client==1.7.0 + # via jupyter-server +websockets==12.0 + # via voila +widgetsnbextension==4.0.10 + # via ipywidgets +yarl==1.9.4 + # via aiohttp diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index d2d7c22..123d2e8 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -9,11 +9,36 @@ "@id": "https://orcid.org/0000-0001-7956-4498" } ], - "datePublished": "2024-03-03", - "description": "Tools and examples to woth with Pandora", - "name": "Trove web archives", - "url": "https://github.com/GLAM-Workbench/trove-web-archives", - "version": "0.0.0" + "datePublished": "2024-05-03T03:41:59+00:00", + "description": "A GLAM Workbench repository", + "hasPart": [ + { + "@id": "create-datasets.ipynb" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson" + }, + { + "@id": "harvest-pandora-subject-collections.ipynb" + }, + { + "@id": "harvest-pandora-titles.ipynb" + } + ], + "license": { + "@id": "https://spdx.org/licenses/MIT" + }, + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives" + }, + "name": "trove-web-archives", + "url": "https://github.com/GLAM-Workbench/trove-web-archives/" }, { "@id": "ro-crate-metadata.json", @@ -23,12 +48,314 @@ }, "conformsTo": { "@id": "https://w3id.org/ro/crate/1.1" + }, + "license": { + "@id": "https://creativecommons.org/publicdomain/zero/1.0/" } }, + { + "@id": "create-datasets.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "category": "", + "codeRepository": "https://github.com/GLAM-Workbench/trove-web-archives/", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "This notebook helps you create a dataset of archived urls using Pandora's subject and collection groupings.\n\nThe Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search that would find websites relating to election campaigns? Fortunately you don't have to, as Pandora provides a collection of archived web resources organised by subject and collection. 
By using harvests of Pandora's subject hierarchy and a complete list of archived titles, this notebook makes it easy for you to create custom datasets relating to a specific topic or event.", + "encodingFormat": "application/x-ipynb+json", + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives/create-datasets/" + }, + "name": "Create title datasets from collections and subjects", + "position": 0, + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + }, + "url": "https://github.com/GLAM-Workbench/trove-web-archives/blob/master/create-datasets.ipynb", + "workExample": [] + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv", + "@type": [ + "File", + "Dataset" + ], + "contentSize": 13200346, + "dateModified": "2024-05-03", + "encodingFormat": "text/csv", + "isPartOf": { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles" + }, + "name": "pandora-titles.csv", + "sdDatePublished": "2024-05-03", + "size": 87741, + "url": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson", + "@type": [ + "File", + "Dataset" + ], + "contentSize": 1901413, + "dateModified": "2024-05-02", + "isPartOf": { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections" + }, + "name": "pandora-subjects.ndjson", + "sdDatePublished": "2024-05-03", + "size": 149, + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson", + "@type": [ + "File", + "Dataset" + ], + "contentSize": 844738, + "dateModified": "2024-05-02", + "isPartOf": { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections" + }, + "name": "pandora-collections.ndjson", + "sdDatePublished": "2024-05-03", + "size": 1920, + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson" + }, + { + "@id": "harvest-pandora-subject-collections.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "category": "", + "codeRepository": "https://github.com/GLAM-Workbench/trove-web-archives/", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles. 
The datasets created can be used to assemble subject-based collections of archived websites for research.", + "encodingFormat": "application/x-ipynb+json", + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/" + }, + "name": "Harvest Pandora subjects and collections", + "position": 0, + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + }, + "url": "https://github.com/GLAM-Workbench/trove-web-archives/blob/master/harvest-pandora-subject-collections.ipynb", + "workExample": [] + }, + { + "@id": "harvest-pandora-titles.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "category": "", + "codeRepository": "https://github.com/GLAM-Workbench/trove-web-archives/", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.", + "encodingFormat": "application/x-ipynb+json", + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/" + }, + "name": "Harvest the full collection of Pandora titles", + "position": 0, + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + }, + "url": "https://github.com/GLAM-Workbench/trove-web-archives/blob/master/harvest-pandora-titles.ipynb", + "workExample": [] + }, + { + "@id": "https://glam-workbench.net/trove-web-archives", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net/" + }, + "name": "Trove web archive collections (Pandora)", + "url": "https://glam-workbench.net/trove-web-archives" + }, + { + "@id": "https://glam-workbench.net/", + "@type": "CreativeWork", + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "description": "A collection of tools, tutorials, examples, and hacks to help researchers work with data from galleries, libraries, archives, and museums (the GLAM sector).", + "name": "GLAM Workbench", + "url": "https://glam-workbench.net/" + }, { "@id": "https://orcid.org/0000-0001-7956-4498", "@type": "Person", - "name": "Sherratt, Tim" + "mainEntityOfPage": "https://timsherratt.au", + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + }, + { + "@id": "https://spdx.org/licenses/MIT", + "@type": "CreativeWork", + "name": "MIT License", + "url": "https://spdx.org/licenses/MIT.html" + }, + { + "@id": "https://creativecommons.org/publicdomain/zero/1.0/", + "@type": "CreativeWork", + "name": "CC0 Public Domain Dedication", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + { + "@id": "http://rightsstatements.org/vocab/NKC/1.0/", + "@type": "CreativeWork", + "description": "The organization that has made the Item available reasonably believes that the Item is not restricted by copyright or related rights, but a conclusive determination could not be made.", + "name": "No Known Copyright", + "url": "http://rightsstatements.org/vocab/NKC/1.0/" + }, + { + "@id": "http://rightsstatements.org/vocab/CNE/1.0/", + "@type": "CreativeWork", + "description": "The copyright and related rights status of this Item has not been evaluated.", + "name": "Copyright Not Evaluated", + "url": "http://rightsstatements.org/vocab/CNE/1.0/" + }, + { + "@id": 
"https://www.python.org/downloads/release/python-31012/", + "@type": [ + "ComputerLanguage", + "SoftwareApplication" + ], + "name": "Python 3.10.12", + "url": "https://www.python.org/downloads/release/python-31012/", + "version": "3.10.12" + }, + { + "@id": "https://glam-workbench.net/trove-web-archives/create-datasets/", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net" + }, + "name": "Create datasets", + "url": "https://glam-workbench.net/trove-web-archives/create-datasets/" + }, + { + "@id": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net" + }, + "name": "Harvest pandora subject collections", + "url": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/" + }, + { + "@id": "https://glam-workbench.net/trove-web-archives/pandora-collections-data", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net" + }, + "name": "Pandora collections data", + "url": "https://glam-workbench.net/trove-web-archives/pandora-collections-data" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections", + "@type": "Dataset", + "description": "This dataset contains details of the subject and collection groupings used by Pandora to organise archived web resource titles.", + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives/pandora-collections-data" + }, + "name": "trove-web-archives-collections", + "url": "https://github.com/GLAM-Workbench/trove-web-archives-collections", + "workExample": [] + }, + { + "@id": "#harvest-pandora-subject-collections_run_0", + "@type": "CreateAction", + "actionStatus": { + "@id": "http://schema.org/CompletedActionStatus" + }, + "endDate": "2024-05-02", + "instrument": { + "@id": "harvest-pandora-subject-collections.ipynb" + }, + "name": "Run of notebook: harvest-pandora-subject-collections.ipynb", + "result": [ + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson" + } + ] + }, + { + "@id": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net" + }, + "name": "Harvest pandora titles", + "url": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/" + }, + { + "@id": "https://glam-workbench.net/trove-web-archives/pandora-titles-data", + "@type": "CreativeWork", + "isPartOf": { + "@id": "https://glam-workbench.net" + }, + "name": "Pandora titles data", + "url": "https://glam-workbench.net/trove-web-archives/pandora-titles-data" + }, + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles", + "@type": "Dataset", + "description": "This dataset contains a complete list of Pandora's archived web resource titles.", + "mainEntityOfPage": { + "@id": "https://glam-workbench.net/trove-web-archives/pandora-titles-data" + }, + "name": "trove-web-archives-titles", + "url": "https://github.com/GLAM-Workbench/trove-web-archives-titles", + "workExample": [] + }, + { + "@id": "#harvest-pandora-titles_run_0", + "@type": "CreateAction", + "actionStatus": { + "@id": "http://schema.org/CompletedActionStatus" + }, + "endDate": "2024-05-03", + "instrument": { + "@id": "harvest-pandora-titles.ipynb" + }, + "name": "Run of notebook: 
harvest-pandora-titles.ipynb", + "result": [ + { + "@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv" + } + ] } ] } \ No newline at end of file diff --git a/sample_notebook.ipynb b/sample_notebook.ipynb deleted file mode 100644 index 1148ecb..0000000 --- a/sample_notebook.ipynb +++ /dev/null @@ -1,227 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "cf74e980-d0f0-42c7-ad1e-f5fb4611bc9d", - "metadata": {}, - "source": [ - "# My first notebook\n", - "\n", - "This notebook explores a fictional sample of pet ownership." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b8d9cb6b-7776-4829-8cce-7270a0140459", - "metadata": {}, - "outputs": [], - "source": [ - "import altair as alt\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4cca4d52-794e-4990-9e67-1ac8e74ff982", - "metadata": {}, - "outputs": [], - "source": [ - "data = [\n", - " {\"name\": \"Bob\", \"animal\": \"cat\", \"number\": 3},\n", - " {\"name\": \"Bob\", \"animal\": \"dog\", \"number\": 1},\n", - " {\"name\": \"Jan\", \"animal\": \"cat\", \"number\": 0},\n", - " {\"name\": \"Jan\", \"animal\": \"dog\", \"number\": 2},\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "99ffacf6-847f-47b9-a26c-d8d400e78121", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameanimalnumber
0Bobcat3
1Bobdog1
2Jancat0
3Jandog2
\n", - "
" - ], - "text/plain": [ - " name animal number\n", - "0 Bob cat 3\n", - "1 Bob dog 1\n", - "2 Jan cat 0\n", - "3 Jan dog 2" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame(data)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "460273bc-14b3-4ed9-9f84-ee9c10667c15", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "alt.Chart(df).mark_bar().encode(x=\"name:N\", y=\"number:Q\", color=\"animal:N\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaab0485-b62a-4e3d-bc49-687c3a8ed469", - "metadata": {}, - "source": [ - "----\n", - "\n", - "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.net/)." - ] - } - ], - "metadata": { - "rocrate": { - "name": "This is a sample notebook", - "description": "This notebook explores a fictional sample of pet ownership." - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/scripts/update_crate.py b/scripts/update_crate.py index 02a0eb8..a883f24 100755 --- a/scripts/update_crate.py +++ b/scripts/update_crate.py @@ -7,13 +7,14 @@ from pathlib import Path import mimetypes from bs4 import BeautifulSoup +from dotenv import load_dotenv from rocrate.rocrate import ROCrate from rocrate.model.person import Person from rocrate.model.data_entity import DataEntity from rocrate.model.contextentity import ContextEntity -from extract_metadata import ( - extract_notebook_metadata, -) +from extract_metadata import extract_notebook_metadata + +load_dotenv() NOTEBOOK_EXTENSION = ".ipynb" @@ -72,13 +73,13 @@ } -def main(version, data_repo, data_paths): +def main(version, data_repo): # Make working directory the parent of the scripts directory os.chdir(Path(__file__).resolve().parent.parent) # Get a list of paths to notebooks in the cwd notebooks = get_notebooks() # Update the crate - update_crate(version, data_repo, data_paths, notebooks) + update_crate(version, data_repo, notebooks) def get_notebooks(): @@ -92,7 +93,7 @@ def get_notebooks(): """ # files = [Path(file) for file in os.listdir()] files = Path(".").glob("*.ipynb") - is_notebook = lambda file: not file.name.lower().startswith(("draft", "untitled", "index")) + is_notebook = lambda file: not file.name.lower().startswith(("draft", "untitled", "index.")) return list(filter(is_notebook, files)) @@ -184,10 +185,13 @@ def get_file_stats(datafile, local_path): stats = local_file.stat() size = stats.st_size date = datetime.datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d") - rows = 0 - with local_file.open("r") as df: - for line in df: - rows += 1 + if local_file.name.endswith((".zip", ".db")): + rows = "" + else: + rows = 0 + with local_file.open("r") as df: + for line in df: + rows += 1 elif datafile.startswith("http"): # I don't think I want to download the whole file, so set to None rows = None @@ -236,11 +240,12 @@ def get_default_gh_branch(url): # the ghparser doesn't seem to like 'raw' urls url = url.replace("/raw/", "/blob/") gh_parts = ghparse(url) + headers = {'Authorization': f'token {os.getenv("GITHUB_TOKEN")}'} gh_repo_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}" - response = requests.get(gh_repo_url) + response = requests.get(gh_repo_url, headers=headers) return response.json().get("default_branch") -def add_files(crate, action, data_type, gw_url, data_repo, data_paths): +def 
add_files(crate, action, data_type, gw_url, data_repo, local_path): """ Add data files to the crate. Tries to extract some basic info about files (size, date) before adding them. @@ -250,12 +255,14 @@ def add_files(crate, action, data_type, gw_url, data_repo, data_paths): # Loop through list of datafiles for df_data in action.get(data_type, []): datafile = df_data["url"] - local_path = action.get("local_path", ".") + print(datafile) + # print(df_data) + # local_path = action.get("local_path", ".") # Check if file exists (or is a url) if ( Path(datafile).exists() - or (data_repo == "" and datafile.startswith("http")) + or (datafile.startswith("http")) or (data_repo and data_repo in datafile) ): # If this is a data repo crate use the file name (not full url) as the id @@ -284,43 +291,66 @@ def add_files(crate, action, data_type, gw_url, data_repo, data_paths): # but modify the date, size etc later if file_entity: properties = file_entity.properties() + # print(properties) # Otherwise we'll define default properties for a new file entity else: - name = datafile.rstrip("/").split("/")[-1] + if df_data.get("name"): + name = df_data.get("name") + else: + name = datafile.rstrip("/").split("/")[-1] properties = { "name": name, "url": file_url, } - # Add contextual entities for data repo associated with file - # If this is a data repo crate, this is not necessary as the crate root will have this - if not data_repo: - gw_page = action.get("mainEntityOfPage") - if data_repo_url := action.get("isPartOf"): - properties["isPartOf"] = id_ify(data_repo_url) - data_rocrate = { - "@id": data_repo_url, - "@type": "Dataset", - "url": data_repo_url, - "name": data_repo_url.rstrip("/").split("/")[-1] - } - if data_roc_description := action.get("description"): - data_rocrate["description"] = data_roc_description - if gw_page: - add_gw_page_link(crate, gw_page) - data_rocrate["mainEntityOfPage"] = id_ify(gw_page) - add_context_entity(crate, data_rocrate) + # Add contextual entities for data repo associated with file + # If this is a data repo crate, this is not necessary as the crate root will have this + if data_type == "result" and not data_repo: + # print(data_type) + examples = action.get("workExample", []) + # print(examples) + add_example_entities(crate, examples) + if gw_page := action.get("mainEntityOfPage"): + add_gw_page_link(crate, gw_page) + data_repo_url = action.get("isPartOf") + if data_repo_url: + properties["isPartOf"] = id_ify(data_repo_url) + elif gw_page: + properties["mainEntityOfPage"] = id_ify(gw_page) + properties["workExample"] = id_ify([e["url"] for e in examples]) + if not crate.get(data_repo_url): - # Guess the encoding type from extension - encoding = mimetypes.guess_type(datafile)[0] - if encoding: - properties["encodingFormat"] = encoding + data_rocrate = { + "@id": data_repo_url, + "@type": "Dataset", + "url": data_repo_url, + "name": data_repo_url.rstrip("/").split("/")[-1] + } + if data_roc_description := action.get("description"): + print(data_roc_description) + data_rocrate["description"] = data_roc_description + if gw_page: + # print(gw_page) + data_rocrate["mainEntityOfPage"] = id_ify(gw_page) + if current_data_rocrate := crate.get(data_repo_url): + current_examples = [e["@id"] for e in current_data_rocrate.properties().get("workExample")] + else: + current_examples = [] + data_rocrate["workExample"] = id_ify(list(set([e["url"] for e in examples] + current_examples))) + + add_context_entity(crate, data_rocrate) + + + # Guess the encoding type from extension + encoding = 
mimetypes.guess_type(datafile)[0] + if encoding: + properties["encodingFormat"] = encoding - if description := df_data.get("description"): - properties["description"] = description - if license := df_data.get("license"): - properties["license"] = id_ify(license) + if description := df_data.get("description"): + properties["description"] = description + if license := df_data.get("license"): + properties["license"] = id_ify(license) # Add/update modified date if date: @@ -344,8 +374,8 @@ def add_files(crate, action, data_type, gw_url, data_repo, data_paths): # Add/update the file entity and add to the list of file entities local_file = find_local_file(datafile.rstrip("/").split("/")[-1], action.get("local_path", ".")) - print(datafile, local_file, file_id) - if data_repo: + # print(datafile, local_file, file_id) + if data_repo and data_repo in datafile: crate_id = local_file else: crate_id = file_id @@ -360,7 +390,7 @@ def add_files(crate, action, data_type, gw_url, data_repo, data_paths): return file_entities -def add_action(crate, notebook, input_files, output_files, query, index): +def add_action(crate, notebook, input_files, output_files, query, index, local_path): """ Links a notebook and associated datafiles through a CreateAction. """ @@ -376,7 +406,7 @@ def add_action(crate, notebook, input_files, output_files, query, index): # There's no dates (or no output files) except IndexError: # Use the date the notebook was last modified - last_date, _, _ = get_file_stats(notebook.id, ["."]) + last_date, _, _ = get_file_stats(notebook.id, local_path) # Check to see if this action is already in the crate action_current = crate.get(action_id) @@ -409,6 +439,16 @@ def add_action(crate, notebook, input_files, output_files, query, index): for output in output_files: action_new.append_to("result", output) +def add_example_entities(crate, examples): + for example in examples: + example_props = { + "@id": example["url"], + "@type": "CreativeWork", + "name": example["name"], + "url": example["url"] + } + add_context_entity(crate, example_props) + def creates_data(data_repo, notebook_metadata): """ @@ -416,13 +456,13 @@ def creates_data(data_repo, notebook_metadata): """ if data_repo: for action in notebook_metadata["action"]: - for result in action["result"]: + for result in action.get("result", []): if data_repo in result["url"]: return True return False -def add_notebook(crate, notebook, data_repo, data_path, gw_url): +def add_notebook(crate, notebook, data_repo, gw_url): """Adds notebook information to an ROCRate. 
Parameters: @@ -440,10 +480,13 @@ def add_notebook(crate, notebook, data_repo, data_path, gw_url): "author": [], "description": "", "action": [], - "mainEntityOfPage": "" + "mainEntityOfPage": "", + "workExample": [], + "category": "", + "position": 0 }, ) - # print(notebook_metadata) + # print(notebook.name) has_data = creates_data(data_repo, notebook_metadata) # If this is a data repo crate change nb ids to full urls @@ -489,12 +532,19 @@ def add_notebook(crate, notebook, data_repo, data_path, gw_url): ), "codeRepository": repo_url, "url": nb_url, + "category": notebook_metadata["category"], + "position": notebook_metadata["position"] } if doc_url := notebook_metadata.get("mainEntityOfPage"): add_gw_page_link(crate, doc_url) properties["mainEntityOfPage"] = id_ify(doc_url) + nb_examples = notebook_metadata.get("workExample", []) + add_example_entities(crate, nb_examples) + properties["workExample"] = id_ify([e["url"] for e in nb_examples]) + + # Add input files from 'object' property of actions #nb_inputs = [a["object"] for a in notebook_metadata.get("action", [])] #input_files = add_files(crate, nb_inputs, data_repo) @@ -509,11 +559,13 @@ def add_notebook(crate, notebook, data_repo, data_path, gw_url): # Add a CreateAction that links the notebook run with the input and output files for index, action in enumerate(notebook_metadata.get("action", [])): + local_path = action.get("local_path", ".") if not data_repo or data_repo in action.get("result", [])[0]["url"]: # print(action) - input_files = add_files(crate, action, "object", gw_url, data_repo, data_paths) - output_files = add_files(crate, action, "result", gw_url, data_repo, data_paths) - add_action(crate, nb_new, input_files, output_files, action.get("query", ""), index) + input_files = add_files(crate, action, "object", gw_url, data_repo, local_path) + output_files = add_files(crate, action, "result", gw_url, data_repo, local_path) + if output_files: + add_action(crate, nb_new, input_files, output_files, action.get("query", ""), index, local_path) if data_repo: if dataset_gw_page := action.get("mainEntityOfPage"): crate.update_jsonld({"@id": "./", "mainEntityOfPage": id_ify(dataset_gw_page)}) @@ -521,17 +573,10 @@ def add_notebook(crate, notebook, data_repo, data_path, gw_url): if dataset_description := action.get("description"): crate.update_jsonld({"@id": "./", "description": dataset_description}) dataset_examples = action.get("workExample", []) - crate.update_jsonld({"@id": "./", "workExample": id_ify([e["url"] for e in dataset_examples])}) - for example in dataset_examples: - example_props = { - "@id": example["url"], - "@type": "CreativeWork", - "name": example["name"], - "url": example["url"] - } - add_context_entity(crate, example_props) + current_examples = root.get("workExample", []) + crate.update_jsonld({"@id": "./", "workExample": id_ify([e["url"] for e in dataset_examples]) + current_examples}) + add_example_entities(crate, dataset_examples) - # If the notebook has author info, add people to crate if notebook_metadata["author"]: # Add people referenced in notebook metadata @@ -651,7 +696,7 @@ def get_gw_docs(repo_name): return {"url": gw_url, "title": gw_title} -def update_crate(version, data_repo, data_paths, notebooks): +def update_crate(version, data_repo, notebooks): """Creates a parent crate in the supplied directory. 
Parameters: @@ -771,7 +816,7 @@ def update_crate(version, data_repo, data_paths, notebooks): # Process notebooks for notebook in notebooks: - add_notebook(crate, notebook, data_repo, data_paths, gw_url) + add_notebook(crate, notebook, data_repo, gw_url) # Remove files from crate if they're no longer in the repo # remove_deleted_files(crate, data_paths) @@ -789,8 +834,5 @@ def update_crate(version, data_repo, data_paths, notebooks): "--version", type=str, help="New version number", required=False ) parser.add_argument("--data-repo", type=str, default="", required=False) - parser.add_argument("--data-paths", type=str, default=".", required=False) args = parser.parse_args() - data_paths = ["."] - data_paths += args.data_paths.split(",") - main(args.version, args.data_repo, data_paths) + main(args.version, args.data_repo)
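
With these changes, `scripts/update_crate.py` no longer accepts a `--data-paths` option (each action's `local_path` metadata is used instead), and the GitHub default-branch lookup sends a token read from the environment via `python-dotenv`. A minimal invocation sketch follows; the token value, version string, and `--data-repo` value are all illustrative, and `--data-repo` only needs to be a string that occurs in the data repository's file URLs (the script matches it by substring):

```
# GITHUB_TOKEN is read via load_dotenv()/os.getenv() for GitHub API requests;
# put it in a .env file or export it (the value below is a placeholder)
export GITHUB_TOKEN=ghp_placeholder

# rebuild ro-crate-metadata.json for the code repository itself
python scripts/update_crate.py --version v1.1.0

# build a crate describing an associated data repository (illustrative value)
python scripts/update_crate.py --version v1.1.0 --data-repo trove-web-archives-titles
```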