diff --git a/.gitignore b/.gitignore
index 1e627b7..06f94ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ Untitled*
.python-version
http_cache.sqlite
data-rocrate
+subject-hierarchy.md
diff --git a/.zenodo.json b/.zenodo.json
index 8f06344..88f0637 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -1,42 +1,41 @@
{
- "language": "eng",
- "license": "MIT",
- "title": "GLAM-Workbench/trove-web-archives",
- "related_identifiers": [
- {
- "scheme": "url",
- "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v0.0.0",
- "relation": "isDerivedFrom",
- "resource_type": "software"
- },
- {
- "scheme": "url",
- "identifier": "https://glam-workbench.net/trove-web-archives/",
- "relation": "isDocumentedBy",
- "resource_type": "publication-softwaredocumentation"
- },
- {
- "scheme": "url",
- "identifier": "https://glam-workbench.net/",
- "relation": "isPartOf",
- "resource_type": "other"
- }
- ],
- "version": "v0.0.0",
- "upload_type": "software",
- "keywords": [
- "digital humanities",
- "Jupyter",
- "GLAM Workbench"
- ],
- "publication_date": "2024-03-03",
- "creators": [
+ "language": "eng",
+ "license": "MIT",
+ "title": "GLAM-Workbench/trove-web-archives",
+ "related_identifiers": [
{
- "name": "Sherratt, Tim",
- "orcid": "0000-0001-7956-4498"
+ "scheme": "url",
+ "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v1.0.0",
+ "relation": "isDerivedFrom",
+ "resource_type": "software"
+ },
+ {
+ "scheme": "url",
+ "identifier": "https://glam-workbench.net/trove-web-archives/",
+ "relation": "isDocumentedBy",
+ "resource_type": "publication-softwaredocumentation"
+ },
+ {
+ "scheme": "url",
+ "identifier": "https://glam-workbench.net/",
+ "relation": "isPartOf",
+ "resource_type": "other"
+ }
+ ],
+ "version": "v1.0.0",
+ "upload_type": "software",
+ "keywords": [
+ "digital humanities",
+ "Jupyter",
+ "GLAM Workbench"
+ ],
+ "publication_date": "2024-05-06",
+ "creators": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "0000-0001-7956-4498"
}
-],
- "access_right": "open",
- "description": ""
- }
-
\ No newline at end of file
+ ],
+ "access_right": "open",
+ "description": "
A GLAM Workbench repository
For more information and documentation see the Trove web archive collections (Pandora) section of the GLAM Workbench.
Notebooks
- Create title datasets from collections and subjects
- Harvest Pandora subjects and collections
- Harvest the full collection of Pandora titles
Associated datasets
Created by Tim Sherratt for the GLAM Workbench
"
+}
diff --git a/README.md b/README.md
index 94d6415..349ba8e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
# trove-web-archives
-A GLAM Workbench repository
+CURRENT VERSION: v1.0.0
+
+This repository includes information on finding, understanding, and using Pandora's collections of archived web pages.
+
+[Pandora](http://pandora.nla.gov.au/) has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.
+
+The [Web Archives](https://glam-workbench.net/web-archives/) section of the GLAM Workbench provides documentation, tools, and examples to help you work with data from a range of web archives, including the Australian Web Archive. The title urls obtained through Pandora can be used to retrieve additional data from the AWA for analysis.
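+
+For example, you could load the pre-harvested titles dataset and use the `surt` column to group archived urls by domain. Here's a minimal sketch using pandas (the dataset url and fields are documented in the harvesting notebooks):
+
+```python
+import pandas as pd
+
+# Load the pre-harvested list of Pandora titles
+df = pd.read_csv(
+    "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv"
+)
+
+# Surts put the top-level domain first, so government sites start with "au,gov,"
+gov_titles = df.loc[df["surt"].str.startswith("au,gov,", na=False)]
+print(gov_titles.shape[0], "titles from gov.au domains")
+```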
For more information and documentation see the [Trove web archive collections (Pandora)](https://glam-workbench.net/trove-web-archives) section of the [GLAM Workbench](https://glam-workbench.net).
diff --git a/crate_maker.py b/crate_maker.py
new file mode 100644
index 0000000..799e281
--- /dev/null
+++ b/crate_maker.py
@@ -0,0 +1,114 @@
+from rocrate.rocrate import ContextEntity, ROCrate
+import ipynbname
+import nbformat
+import mimetypes
+from datetime import datetime
+from giturlparse import parse as ghparse
+import requests
+
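+# Helpers for building RO-Crate metadata files describing the datasets
+# created by the notebooks in this repository (used by create-datasets.ipynb).
+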
+def add_gh_file(crate, url):
+ datafile = url.replace("/raw/", "/blob/")
+ gh_parts = ghparse(datafile)
+
+ # API url to get the latest commit for this file
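+    # (only the filename is used as the path, so this assumes files sit at the repository root)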
+ gh_commit_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/commits?path={gh_parts.path_raw.split('/')[-1]}"
+ try:
+ response = requests.get(gh_commit_url)
+
+ # Get the date of the last commit
+ date = response.json()[0]["commit"]["committer"]["date"][:10]
+
+ except (IndexError, KeyError):
+ date = None
+
+ # Different API endpoint for file data
+ gh_file_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/contents/{gh_parts.path_raw.split('/')[-1]}"
+ try:
+ response = requests.get(gh_file_url)
+ contents_data = response.json()
+ # Get the file size
+ try:
+ size = contents_data["size"]
+ except TypeError:
+ size = None
+
+ except KeyError:
+ size = None
+ obj_properties = {
+ "@type": [
+ "File",
+ "Dataset"
+ ],
+ "contentSize": size,
+ "dateModified": date,
+ "name": gh_parts.path_raw.split('/')[-1],
+ "url": datafile
+ }
+ crate.add_file(datafile, properties=obj_properties)
+
+def create_rocrate(subject, file_path, start_date, end_date):
+ """
+ Create an RO-Crate metadata file describing the downloaded dataset.
+ """
+ crate = ROCrate()
+
+ # Initialise crate with dataset
+ crate.add_file(file_path)
+
+ # Add notebook details
+ nb_path = ipynbname.path()
+ nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
+ metadata = nb.metadata.rocrate
+ nb_url = metadata.get("url", "")
+ nb_properties = {
+ "@type": ["File", "SoftwareSourceCode"],
+ "name": metadata.get("name", ""),
+ "description": metadata.get("description", ""),
+ "encodingFormat": "application/x-ipynb+json",
+ "codeRepository": metadata.get("codeRepository", ""),
+ "url": nb_url,
+ }
+ crate.add(ContextEntity(crate, nb_url, properties=nb_properties))
+
+ # Add action
+ action_id = f"{nb_path.stem}_run"
+ action_properties = {
+ "@type": "CreateAction",
+ "instrument": {"@id": nb_url},
+ "actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
+ "name": f"Run of notebook: {nb_path.name}",
+ "result": {"@id": f"{file_path.name}/"},
+ "object": [{"@id": o["url"]} for o in metadata["action"][0]["object"]],
+ "query": f"{subject['id']} ({subject['name']})",
+ "startDate": start_date,
+ "endDate": end_date,
+ }
+
+ # If there are any GitHub references in action objects, add them to the crate
+ for obj in metadata["action"][0]["object"]:
+ if "github.com" in obj["url"]:
+ add_gh_file(crate, obj["url"])
+
+ # Update dataset details
+ encoding = mimetypes.guess_type(file_path)[0]
+ stats = file_path.stat()
+ size = stats.st_size
+ date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
+ rows = 0
+ with file_path.open("r") as df:
+ for line in df:
+ rows += 1
+ crate.update_jsonld(
+ {
+ "@id": file_path.name,
+ "dateModified": date,
+ "contentSize": size,
+ "size": rows,
+ "encodingFormat": encoding,
+ }
+ )
+ crate.add(ContextEntity(crate, action_id, properties=action_properties))
+
+ # Save the crate
+ crate.write(file_path.parent)
+ crate.write_zip(file_path.parent)
\ No newline at end of file
diff --git a/create-datasets.ipynb b/create-datasets.ipynb
index babf1ce..bed167c 100644
--- a/create-datasets.ipynb
+++ b/create-datasets.ipynb
@@ -13,20 +13,26 @@
"source": [
"# Create archived url datasets from Pandora's collections and subjects\n",
"\n",
- "This notebook helps you create a dataset of archived urls using Pandora's subject and collection groupings.\n",
+ "The Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search to find websites relating to Australian election campaigns? Fortunately you don't have to, as [Pandora](http://pandora.nla.gov.au/) provides a collection of archived web resources organised by subject and collection – including [thousands of sites about elections](http://pandora.nla.gov.au/subject/6). This notebook makes it easy to save details of all the archived websites under any heading in Pandora's subject hierarchy, creating custom datasets relating to specific topics or events.\n",
"\n",
- "The Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search that would find websites relating to election campaigns? Fortunately you don't have to, as Pandora provides a collection of archived web resources organised by subject and collection. By using harvests of Pandora's subject hierarchy and a complete list of archived titles, this notebook makes it easy for you to create custom datasets relating to a specific topic or event.\n",
- "\n",
- "This notebook uses pre-harvested datasets containing information about Pandora's subjects, collections and titles. New titles are added to Pandora frequently, so you might want to create your own updated versions using these notebooks:\n",
+ "For convenience, this notebook uses pre-harvested datasets containing information about Pandora's subjects, collections and titles. New titles are added to Pandora frequently, so you might want to create your own updated versions using these notebooks:\n",
"\n",
"- [Harvest Pandora subjects and collections](harvest-pandora-subject-collections.ipynb)\n",
"- [Harvest the full collection of Pandora titles](harvest-pandora-titles.ipynb)\n",
"\n",
"## Using this notebook\n",
"\n",
- "The simplest way to get started is to browse the subject and collection groupings in [Pandora](http://pandora.nla.gov.au/). Once you've found a subject or collection of interest, just copy its identifier, either `/subject/[subject number]` or `/col/[collection number]`. You also need to decide if you want *every* title under that subject or collection, including those associated with its children, or if you only want the titles directly linked to your selected grouping.\n",
+ "The simplest way to get started is to browse the subject and collection groupings in [Pandora](http://pandora.nla.gov.au/). Once you've found a subject or collection of interest, just copy its identifier, either `/subject/[subject number]` or `/col/[collection number]`. I've also created a [single-page version of the complete subject hierarchy](https://glam-workbench.net/trove-web-archives/pandora-subject-hierarchy/) which makes it a bit easier to see what's included under each level.\n",
+ "\n",
+ "Titles can be linked to any level in this hierarchy. To assemble a complete list of titles under a subject such as 'Arts', for example, you need to get all the titles from 'Arts', all of the titles from all of the sub-categories under 'Arts', and all of the titles from all of the collections and sub-collections under both 'Arts' and its subcategories. So when you create your dataset you need to decide if you want *every* title under that subject or collection, including those associated with its children, or if you only want the titles directly linked to your selected heading.\n",
+ "\n",
+ "You can then run either `create_subject_dataset([your subject id])` or `create_collection_dataset([your collection id])` in the cells below.\n",
"\n",
- "Then you can run either `create_subject_dataset([your subject id])` or `create_collection_dataset([your collection id])`.\n",
+ "If you want to include titles from any child categories or collections, set the `include_subcategories` and `include_collections` parameters to `True`.\n",
+ "\n",
+ "For example:\n",
+ "\n",
+ "- `create_subject_dataset(\"/subject/6\", include_collections=True)` will generate a dataset that contains every archived url under the 'Elections' heading, including urls in child collections.\n",
"\n",
"## Datasets\n",
"\n",
@@ -58,7 +64,7 @@
},
{
"cell_type": "code",
- "execution_count": 329,
+ "execution_count": 1,
"id": "9362f044-66c3-44b4-b4b4-8566603bb11d",
"metadata": {
"editable": true,
@@ -74,31 +80,29 @@
"True"
]
},
- "execution_count": 329,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "import mimetypes\n",
"import os\n",
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
- "import ipynbname\n",
- "import nbformat\n",
"import pandas as pd\n",
"from dotenv import load_dotenv\n",
"from IPython.display import HTML, display\n",
- "from rocrate.rocrate import ContextEntity, ROCrate\n",
"from slugify import slugify\n",
"\n",
+ "import crate_maker\n",
+ "\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
- "execution_count": 300,
+ "execution_count": 2,
"id": "1ab9af6c-01d7-459b-8595-31325aeb3558",
"metadata": {
"editable": true,
@@ -109,6 +113,8 @@
},
"outputs": [],
"source": [
+ "# Load the pre-harvested datasets\n",
+ "\n",
"dfc = pd.read_json(\n",
" \"https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson\",\n",
" lines=True,\n",
@@ -119,60 +125,7 @@
")\n",
"dft = pd.read_csv(\n",
" \"https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv\"\n",
- ")\n",
- "\n",
- "\n",
- "def create_rocrate(subject, file_path, start_date, end_date):\n",
- " \"\"\"\n",
- " Create an RO-Crate metadata file describing the downloaded dataset.\n",
- " \"\"\"\n",
- " crate = ROCrate()\n",
- " crate.add_file(file_path)\n",
- " nb_path = ipynbname.path()\n",
- " nb = nbformat.read(nb_path, nbformat.NO_CONVERT)\n",
- " metadata = nb.metadata.rocrate\n",
- " nb_url = metadata.get(\"url\", \"\")\n",
- " nb_properties = {\n",
- " \"@type\": [\"File\", \"SoftwareSourceCode\"],\n",
- " \"name\": metadata.get(\"name\", \"\"),\n",
- " \"description\": metadata.get(\"description\", \"\"),\n",
- " \"encodingFormat\": \"application/x-ipynb+json\",\n",
- " \"codeRepository\": metadata.get(\"codeRepository\", \"\"),\n",
- " \"url\": nb_url,\n",
- " }\n",
- " crate.add(ContextEntity(crate, nb_url, properties=nb_properties))\n",
- " action_id = f\"{nb_path.stem}_run\"\n",
- " action_properties = {\n",
- " \"@type\": \"CreateAction\",\n",
- " \"instrument\": {\"@id\": nb_url},\n",
- " \"actionStatus\": {\"@id\": \"http://schema.org/CompletedActionStatus\"},\n",
- " \"name\": f\"Run of notebook: {nb_path.name}\",\n",
- " \"result\": {\"@id\": f\"{file_path.name}/\"},\n",
- " \"object\": [{\"@id\": o[\"url\"]} for o in metadata[\"action\"][0][\"object\"]],\n",
- " \"query\": f\"{subject['id']} ({subject['name']})\",\n",
- " \"startDate\": start_date,\n",
- " \"endDate\": end_date,\n",
- " }\n",
- " encoding = mimetypes.guess_type(file_path)[0]\n",
- " stats = file_path.stat()\n",
- " size = stats.st_size\n",
- " date = datetime.fromtimestamp(stats.st_mtime).strftime(\"%Y-%m-%d\")\n",
- " rows = 0\n",
- " with file_path.open(\"r\") as df:\n",
- " for line in df:\n",
- " rows += 1\n",
- " crate.update_jsonld(\n",
- " {\n",
- " \"@id\": file_path.name,\n",
- " \"dateModified\": date,\n",
- " \"contentSize\": size,\n",
- " \"size\": rows,\n",
- " \"encodingFormat\": encoding,\n",
- " }\n",
- " )\n",
- " crate.add(ContextEntity(crate, action_id, properties=action_properties))\n",
- " crate.write(file_path.parent)\n",
- " crate.write_zip(file_path.parent)"
+ ")"
]
},
{
@@ -185,7 +138,7 @@
},
{
"cell_type": "code",
- "execution_count": 323,
+ "execution_count": 3,
"id": "cb8497ee-fae1-4252-bdb4-5fe1ab6fb12a",
"metadata": {
"editable": true,
@@ -197,6 +150,9 @@
"outputs": [],
"source": [
"def get_title_ids_in_collection(coll_id, include_subcollections=True):\n",
+ " \"\"\"\n",
+ " Get all the title ids in a collection.\n",
+ " \"\"\"\n",
" title_ids = []\n",
" coll = dfc.loc[dfc[\"id\"] == coll_id].iloc[0]\n",
" title_ids += coll[\"titles\"]\n",
@@ -207,9 +163,10 @@
" return title_ids\n",
"\n",
"\n",
- "def get_urls_by_subject(\n",
+ "def get_titles_by_subject(\n",
" subject, include_subcategories=False, include_collections=False\n",
"):\n",
+ "\n",
" title_ids = []\n",
" title_ids += subject[\"titles\"]\n",
" if include_subcategories:\n",
@@ -232,7 +189,7 @@
" start_date = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
" subject = dfs.loc[dfs[\"id\"] == id].iloc[0]\n",
"\n",
- " df = get_urls_by_subject(\n",
+ " df = get_titles_by_subject(\n",
" subject,\n",
" include_subcategories=include_subcategories,\n",
" include_collections=include_collections,\n",
@@ -245,7 +202,7 @@
" output_file = Path(output_path, f\"pandora-{subject_slug}.csv\")\n",
" df.to_csv(output_file, index=False)\n",
" if include_crate:\n",
- " create_rocrate(subject, output_file, start_date, end_date)\n",
+ " crate_maker.create_rocrate(subject, output_file, start_date, end_date)\n",
" display(\n",
" HTML(\n",
" f\"Download dataset: datasets/{subject_slug}.zip\"\n",
@@ -255,7 +212,7 @@
},
{
"cell_type": "code",
- "execution_count": 324,
+ "execution_count": 4,
"id": "a9895f20-1891-4c5e-8bfe-bb3ef54f023f",
"metadata": {
"editable": true,
@@ -270,7 +227,7 @@
{
"data": {
"text/html": [
- "Download dataset: datasets/subject-3-business-economy.zip"
+ "Download dataset: datasets/pandora-subject-3-business-economy.zip"
],
"text/plain": [
""
@@ -296,7 +253,7 @@
},
{
"cell_type": "code",
- "execution_count": 327,
+ "execution_count": 5,
"id": "a1f8b140-1453-4b1f-ba7f-6bdb3e77b11f",
"metadata": {
"editable": true,
@@ -330,7 +287,7 @@
" output_file = Path(output_path, f\"pandora-{coll_slug}.csv\")\n",
" df.to_csv(output_file, index=False)\n",
" if include_crate:\n",
- " create_rocrate(coll, output_file, start_date, end_date)\n",
+ " crate_maker.create_rocrate(coll, output_file, start_date, end_date)\n",
" display(\n",
" HTML(\n",
" f\"Download dataset: datasets/{coll_slug}.zip\"\n",
@@ -340,7 +297,7 @@
},
{
"cell_type": "code",
- "execution_count": 328,
+ "execution_count": 6,
"id": "f9dff1c6-4305-4d87-976e-a11f72a914e2",
"metadata": {
"editable": true,
@@ -371,7 +328,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "ea3b5a17-9b47-4c48-a471-7b42c5948371",
"metadata": {
"editable": true,
@@ -383,7 +340,32 @@
},
"tags": []
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Download dataset: datasets/pandora-subject-3-business-economy.zip"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Download dataset: datasets/pandora-col-21326-museums.zip"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"# IGNORE CELL -- TESTING ONLY\n",
"\n",
@@ -454,6 +436,7 @@
"description": "This notebook helps you create a dataset of archived urls using Pandora's subject and collection groupings.\n\nThe Australian Web Archive makes billions of archived web pages searchable through Trove. But how would you go about constructing a search that would find websites relating to election campaigns? Fortunately you don't have to, as Pandora provides a collection of archived web resources organised by subject and collection. By using harvests of Pandora's subject hierarchy and a complete list of archived titles, this notebook makes it easy for you to create custom datasets relating to a specific topic or event.",
"mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/create-datasets/",
"name": "Create title datasets from collections and subjects",
+ "position": 3,
"url": "https://github.com/GLAM-Workbench/trove-web-archives/raw/master/create-datasets.ipynb"
}
},
diff --git a/harvest-pandora-subject-collections.ipynb b/harvest-pandora-subject-collections.ipynb
index 3ee5751..0af9417 100644
--- a/harvest-pandora-subject-collections.ipynb
+++ b/harvest-pandora-subject-collections.ipynb
@@ -13,7 +13,9 @@
"source": [
"# Harvest Pandora subjects and collections\n",
"\n",
- "The [Pandora](http://pandora.nla.gov.au/) selective web archive assigns archived titles to subject and collection groupings. These curated collections help researchers find archived websites relating to specific topics or events, such as [election campaigns](http://pandora.nla.gov.au/subject/6). This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles. The datasets created can be used to assemble subject-based collections of archived websites for research.\n",
+ "This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles.\n",
+ "\n",
+ "The [Pandora](http://pandora.nla.gov.au/) selective web archive assigns archived titles to subject and collection groupings. These curated collections help researchers find archived websites relating to specific topics or events, such as [election campaigns](http://pandora.nla.gov.au/subject/6). This notebook creates two datasets containing details of all Pandora's subjects and collections. The datasets can be used to [assemble subject-based collections of archived websites for research](https://glam-workbench.net/trove-web-archives/create-datasets/).\n",
"\n",
"## Pandora vs Trove\n",
"\n",
@@ -25,9 +27,9 @@
"\n",
"## Subjects, Collections, and Titles\n",
"\n",
- "There are two levels of subject headings in Pandora. The top-level headings are displayed on the Pandora home page, for example, [Arts](http://pandora.nla.gov.au/subject/2) and [Politics](http://pandora.nla.gov.au/subject/21). The top-level headings can include sub-categories. For example, Arts includes sub-categories for Architecture and Dance. Both the top-level subjects and sub-categories can include collections and titles.\n",
+ "There are two levels of subject headings in Pandora. The top-level headings are displayed on the Pandora home page, for example, [Arts](http://pandora.nla.gov.au/subject/2) and [Politics](http://pandora.nla.gov.au/subject/21). The top-level headings can include sub-categories. For example, 'Arts' includes sub-categories for 'Architecture' and 'Dance'. Both the top-level subjects and sub-categories can include collections and titles.\n",
"\n",
- "Collections are more fine-grained groupings of titles, often related to specific events or activities. Collections can include sub-collections. In Pandora's web interface, the sub-collections are displayed as sub-headings on the collection page, but in the backend each sub-collection has its own identifier. For example, the Galleries collection, includes a list of gallery websites divided into sub-collections by the state in which they're located. Both collections and sub-collections can contain titles.\n",
+ "Collections are more fine-grained groupings of titles, often related to specific events or activities. Collections can include sub-collections. In Pandora's web interface, the sub-collections are displayed as sub-headings on the collection page, but in the backend each sub-collection has its own identifier. For example, the 'Galleries' collection, includes a list of gallery websites divided into sub-collections by the state in which they're located. Both collections and sub-collections can contain titles.\n",
"\n",
"Collections can appear in multiple subjects and sub-categories. This means that the harvesting process saves duplicate copies of collections that need to be removed.\n",
"\n",
@@ -39,12 +41,27 @@
"\n",
"## Datasets\n",
"\n",
- "This notebook creates two datasets:\n",
+ "This notebook creates two datasets in `ndjson` format (one JSON object per line):\n",
"\n",
"- `pandora-subjects.ndjson`\n",
"- `pandora-collections.ndjson`\n",
"\n",
- "Pre-harvested versions of these datasets are available from the [trove-web-archives-collections-data](https://github.com/GLAM-Workbench/trove-web-archives-collections-data) repository.\n"
+ "The `pandora-subjects.ndjson` file includes the following fields:\n",
+ "\n",
+ "- `name` – subject heading\n",
+ "- `id` – subject identifier in the form `/subject/[number]`\n",
+ "- `subcategories` – list of subcategory identifiers\n",
+ "- `collections` – list of collection identifiers\n",
+ "- `titles` – list of title identifiers\n",
+ "\n",
+ "The `pandora-collections.ndjson` file includes the following fields:\n",
+ "\n",
+ "- `name` – collection/subcollection name\n",
+ "- `id` – collection identifier in the form `/col/[number]`\n",
+ "- `subcollections` – list of subcollection identifiers\n",
+ "- `titles` – list of title identifiers\n",
+ "\n",
+ "Pre-harvested versions of these datasets are available from the [Pandora collections data](https://glam-workbench.net/trove-web-archives/pandora-collections-data/) section of the GLAM Workbench.\n"
]
},
{
@@ -77,7 +94,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"id": "1d6f6094-ce03-48d7-9ef2-7cdac14dba11",
"metadata": {
"editable": true,
@@ -88,157 +105,193 @@
},
"outputs": [],
"source": [
- "def get_title_ids(page_id):\n",
- " \"\"\"\n",
- " Get the TEP identifiers for all the titles on the specified page.\n",
- " Excludes titles in subcollections as they will can be harvested separately.\n",
- " \"\"\"\n",
- " title_ids = []\n",
- " page = 1\n",
- " # Subjects can have multiple pages of titles, so we'll go through page by page\n",
- " # until there's no more titles\n",
- " while page:\n",
- " response = requests.get(f\"http://pandora.nla.gov.au{page_id}/{page}\")\n",
+ "class SubjectHarvester:\n",
+ "\n",
+ " def __init__(\n",
+ " self,\n",
+ " subject_output=\"pandora-subjects.ndjson\",\n",
+ " collection_output=\"pandora-collections.ndjson\",\n",
+ " sample=None,\n",
+ " ):\n",
+ " self.subject_output = subject_output\n",
+ " self.collection_output = collection_output\n",
+ " self.sample = sample\n",
+ "\n",
+ " def get_title_ids(self, page_id):\n",
+ " \"\"\"\n",
+ " Get the TEP identifiers for all the titles on the specified page.\n",
+ " Excludes titles in subcollections as they will can be harvested separately.\n",
+ " \"\"\"\n",
+ " title_ids = []\n",
+ " page = 1\n",
+ " # Subjects can have multiple pages of titles, so we'll go through page by page\n",
+ " # until there's no more titles\n",
+ " while page:\n",
+ " response = requests.get(f\"http://pandora.nla.gov.au{page_id}/{page}\")\n",
+ " soup = BeautifulSoup(response.text, \"lxml\")\n",
+ " # we only want the first itemlist containing titles\n",
+ " # subsequent titles will be part of subcollections\n",
+ " title_links = []\n",
+ " for item_list in soup.find_all(\"div\", class_=\"itemlist\"):\n",
+ " # This checks if the title list has an h1 tag before it\n",
+ " # which indicates its actually a subcollection\n",
+ " if not (\n",
+ " item_list.find_previous_sibling(\"h1\")\n",
+ " and item_list.find_previous_sibling(\"h1\").name == \"h1\"\n",
+ " ):\n",
+ " # Extract the TEP ids from the links\n",
+ " title_links = item_list.find_all(\"a\", href=re.compile(r\"/tep/\\d+\"))\n",
+ " for title_link in title_links:\n",
+ " title_ids.append(title_link[\"href\"])\n",
+ " # Continue if it's a subject page and there were title links on this page\n",
+ " if title_links and \"/col/\" not in page_id:\n",
+ " page += 1\n",
+ " else:\n",
+ " page = None\n",
+ " time.sleep(0.5)\n",
+ " return title_ids\n",
+ "\n",
+ " def harvest_subcategories(self, subject_id):\n",
+ " \"\"\"\n",
+ " Harvest details of sub-categories from a subject page.\n",
+ " \"\"\"\n",
+ " subject_ids = []\n",
+ " # Get the subject page\n",
+ " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n",
" soup = BeautifulSoup(response.text, \"lxml\")\n",
- " # we only want the first itemlist containing titles\n",
- " # subsequent titles will be part of subcollections\n",
- " title_links = []\n",
- " for item_list in soup.find_all(\"div\", class_=\"itemlist\"):\n",
- " # This checks if the title list has an h1 tag before it\n",
- " # which indicates its actually a subcollection\n",
- " if not (\n",
- " item_list.find_previous_sibling(\"h1\")\n",
- " and item_list.find_previous_sibling(\"h1\").name == \"h1\"\n",
- " ):\n",
- " # Extract the TEP ids from the links\n",
- " title_links = item_list.find_all(\"a\", href=re.compile(r\"/tep/\\d+\"))\n",
- " for title_link in title_links:\n",
- " title_ids.append(title_link[\"href\"])\n",
- " # Continue if it's a subject page and there were title links on this page\n",
- " if title_links and \"/col/\" not in page_id:\n",
- " page += 1\n",
- " else:\n",
- " page = None\n",
- " time.sleep(0.5)\n",
- " return title_ids\n",
- "\n",
- "\n",
- "def harvest_subcategories(subject_id):\n",
- " \"\"\"\n",
- " Harvest details of sub-categories from a subject page.\n",
- " \"\"\"\n",
- " subject_ids = []\n",
- " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n",
- " soup = BeautifulSoup(response.text, \"lxml\")\n",
- " subject_links = soup.find_all(\"a\", href=re.compile(r\"/subject/\\d+$\"))\n",
- "\n",
- " for subject_link in subject_links:\n",
- " subject_name = \" \".join(subject_link.stripped_strings)\n",
- " subject_id = subject_link[\"href\"]\n",
- " collection_ids = harvest_collections(subject_id)\n",
- " title_ids = get_title_ids(subject_id)\n",
- " with Path(subjects_output).open(\"a\") as subjects_file:\n",
- " subjects_file.write(\n",
- " json.dumps(\n",
- " {\n",
- " \"name\": subject_name,\n",
- " \"id\": subject_id,\n",
- " \"collections\": collection_ids,\n",
- " \"titles\": title_ids,\n",
- " }\n",
+ " # Get all the links to subcategories\n",
+ " subject_links = soup.find_all(\"a\", href=re.compile(r\"/subject/\\d+$\"))\n",
+ " # Process all the sub-categories\n",
+ " for subject_link in subject_links:\n",
+ " subject_name = \" \".join(subject_link.stripped_strings)\n",
+ " subject_id = subject_link[\"href\"]\n",
+ " # Get collections\n",
+ " collection_ids = self.harvest_collections(subject_id)\n",
+ " # Get titles\n",
+ " title_ids = self.get_title_ids(subject_id)\n",
+ " with Path(self.subject_output).open(\"a\") as subjects_file:\n",
+ " subjects_file.write(\n",
+ " json.dumps(\n",
+ " {\n",
+ " \"name\": subject_name,\n",
+ " \"id\": subject_id,\n",
+ " \"collections\": collection_ids,\n",
+ " \"titles\": title_ids,\n",
+ " }\n",
+ " )\n",
+ " + \"\\n\"\n",
" )\n",
- " + \"\\n\"\n",
- " )\n",
- " subject_ids.append(subject_id)\n",
- " return subject_ids\n",
- "\n",
- "\n",
- "def harvest_subcollections(coll_id, coll_name):\n",
- " \"\"\"\n",
- " Harvest sub-collections from a collection page.\n",
- " \"\"\"\n",
- " collection_ids = []\n",
- " response = requests.get(f\"http://pandora.nla.gov.au{coll_id}\")\n",
- " soup = BeautifulSoup(response.text, \"lxml\")\n",
- " for subc in soup.find_all(\"h1\"):\n",
- " sub_link = subc.find(\"a\", {\"name\": re.compile(r\"\\d+\")})\n",
- " if sub_link:\n",
- " sub_name = sub_link.string\n",
- " if coll_name not in sub_name:\n",
- " sub_name = f\"{coll_name} - {sub_name}\"\n",
- " sub_id = f\"/col/{sub_link['name']}\"\n",
- " title_ids = get_title_ids(sub_id)\n",
- " with Path(collections_output).open(\"a\") as collections_file:\n",
+ " subject_ids.append(subject_id)\n",
+ " return subject_ids\n",
+ "\n",
+ " def harvest_subcollections(self, coll_id, coll_name):\n",
+ " \"\"\"\n",
+ " Harvest sub-collections from a collection page.\n",
+ " \"\"\"\n",
+ " collection_ids = []\n",
+ " # Get the collection page\n",
+ " response = requests.get(f\"http://pandora.nla.gov.au{coll_id}\")\n",
+ " soup = BeautifulSoup(response.text, \"lxml\")\n",
+ " # Sub-collections are included in the collection pages and identified with h1 headings.\n",
+ " # The h1 headings include a name attribute that is set to the sub-collection id.\n",
+ " # You can use the id to request a page that just has the subcollection.\n",
+ " # First get all the h1 tags\n",
+ " for subc in soup.find_all(\"h1\"):\n",
+ " # Get the id value from the name attribute\n",
+ " sub_link = subc.find(\"a\", {\"name\": re.compile(r\"\\d+\")})\n",
+ " if sub_link:\n",
+ " sub_name = sub_link.string\n",
+ " # Add the collection name to the sub collection name (if it's not already there)\n",
+ " if coll_name not in sub_name:\n",
+ " sub_name = f\"{coll_name} - {sub_name}\"\n",
+ " # Use the sub-collection id to get a list of titles in the sub-collection\n",
+ " sub_id = f\"/col/{sub_link['name']}\"\n",
+ " title_ids = self.get_title_ids(sub_id)\n",
+ " with Path(self.collection_output).open(\"a\") as collections_file:\n",
+ " collections_file.write(\n",
+ " json.dumps(\n",
+ " {\n",
+ " \"name\": sub_name,\n",
+ " \"id\": sub_id,\n",
+ " \"titles\": title_ids,\n",
+ " \"subcollections\": [],\n",
+ " }\n",
+ " )\n",
+ " + \"\\n\"\n",
+ " )\n",
+ " collection_ids.append(sub_id)\n",
+ " return collection_ids\n",
+ "\n",
+ " def harvest_collections(self, subject_id):\n",
+ " \"\"\"\n",
+ " Harvest details of collections from a subject, or sub-category page.\n",
+ " \"\"\"\n",
+ " collection_ids = []\n",
+ " # Get the subject page\n",
+ " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n",
+ " soup = BeautifulSoup(response.text, \"lxml\")\n",
+ " # Get all of the links to collection pages\n",
+ " collection_links = soup.find_all(\"a\", href=re.compile(r\"/col/\\d+$\"))\n",
+ " # Process each collection page\n",
+ " for coll_link in collection_links:\n",
+ " coll_name = \" \".join(coll_link.stripped_strings)\n",
+ " coll_id = coll_link[\"href\"]\n",
+ " # Get any sub-collections\n",
+ " subcollection_ids = self.harvest_subcollections(coll_id, coll_name)\n",
+ " # Get titles\n",
+ " title_ids = self.get_title_ids(coll_id)\n",
+ " with Path(self.collection_output).open(\"a\") as collections_file:\n",
" collections_file.write(\n",
" json.dumps(\n",
" {\n",
- " \"name\": sub_name,\n",
- " \"id\": sub_id,\n",
+ " \"name\": coll_name,\n",
+ " \"id\": coll_id,\n",
+ " \"subcollections\": subcollection_ids,\n",
" \"titles\": title_ids,\n",
- " \"subcollections\": [],\n",
" }\n",
" )\n",
" + \"\\n\"\n",
" )\n",
- " collection_ids.append(sub_id)\n",
- " return collection_ids\n",
- "\n",
- "\n",
- "def harvest_collections(subject_id):\n",
- " \"\"\"\n",
- " Harvest details of collections from a subject, or sub-category page.\n",
- " \"\"\"\n",
- " collection_ids = []\n",
- " response = requests.get(f\"http://pandora.nla.gov.au{subject_id}\")\n",
- " soup = BeautifulSoup(response.text, \"lxml\")\n",
- " collection_links = soup.find_all(\"a\", href=re.compile(r\"/col/\\d+$\"))\n",
- " for coll_link in collection_links:\n",
- " coll_name = \" \".join(coll_link.stripped_strings)\n",
- " coll_id = coll_link[\"href\"]\n",
- " subcollection_ids = harvest_subcollections(coll_id, coll_name)\n",
- " title_ids = get_title_ids(coll_id)\n",
- " with Path(collections_output).open(\"a\") as collections_file:\n",
- " collections_file.write(\n",
- " json.dumps(\n",
- " {\n",
- " \"name\": coll_name,\n",
- " \"id\": coll_id,\n",
- " \"subcollections\": subcollection_ids,\n",
- " \"titles\": title_ids,\n",
- " }\n",
- " )\n",
- " + \"\\n\"\n",
- " )\n",
- " collection_ids.append(coll_id)\n",
- " return collection_ids\n",
- "\n",
- "\n",
- "def harvest_subjects(sample=None):\n",
- " Path(subjects_output).unlink(missing_ok=True)\n",
- " Path(collections_output).unlink(missing_ok=True)\n",
- " response = requests.get(\"http://pandora.nla.gov.au/\")\n",
- " soup = BeautifulSoup(response.text, \"lxml\")\n",
- " subject_list = soup.find(\"div\", class_=\"browseSubjects\").find_all(\"li\")\n",
- " for subject in tqdm(subject_list[:sample]):\n",
- " subject_link = subject.find(\"a\")\n",
- " subject_name = \" \".join(subject_link.stripped_strings)\n",
- " subject_id = subject_link[\"href\"]\n",
- " subcategory_ids = harvest_subcategories(subject_id)\n",
- " subcollection_ids = harvest_collections(subject_id)\n",
- " title_ids = get_title_ids(subject_id)\n",
- " with Path(subjects_output).open(\"a\") as subjects_file:\n",
- " subjects_file.write(\n",
- " json.dumps(\n",
- " {\n",
- " \"name\": subject_name,\n",
- " \"id\": subject_id,\n",
- " \"subcategories\": subcategory_ids,\n",
- " \"collections\": subcollection_ids,\n",
- " \"titles\": title_ids,\n",
- " }\n",
- " )\n",
- " + \"\\n\"\n",
- " )"
+ " collection_ids.append(coll_id)\n",
+ " return collection_ids\n",
+ "\n",
+ " def harvest(self):\n",
+ " \"\"\"\n",
+ " Start the harvest by getting the top-level subjects on the Pandora home page\n",
+ " and work down the hierarchy from there.\n",
+ " \"\"\"\n",
+ " # Remove old data files\n",
+ " Path(self.subject_output).unlink(missing_ok=True)\n",
+ " Path(self.collection_output).unlink(missing_ok=True)\n",
+ " # Get the Pandora home page\n",
+ " response = requests.get(\"http://pandora.nla.gov.au/\")\n",
+ " soup = BeautifulSoup(response.text, \"lxml\")\n",
+ " # Find the list of subjects\n",
+ " subject_list = soup.find(\"div\", class_=\"browseSubjects\").find_all(\"li\")\n",
+ " # Process each top-level subject\n",
+ " for subject in tqdm(subject_list[: self.sample]):\n",
+ " subject_link = subject.find(\"a\")\n",
+ " subject_name = \" \".join(subject_link.stripped_strings)\n",
+ " subject_id = subject_link[\"href\"]\n",
+ " # Get subcategories\n",
+ " subcategory_ids = self.harvest_subcategories(subject_id)\n",
+ " # Get collections\n",
+ " subcollection_ids = self.harvest_collections(subject_id)\n",
+ " # Get titles\n",
+ " title_ids = self.get_title_ids(subject_id)\n",
+ " with Path(self.subject_output).open(\"a\") as subjects_file:\n",
+ " subjects_file.write(\n",
+ " json.dumps(\n",
+ " {\n",
+ " \"name\": subject_name,\n",
+ " \"id\": subject_id,\n",
+ " \"subcategories\": subcategory_ids,\n",
+ " \"collections\": subcollection_ids,\n",
+ " \"titles\": title_ids,\n",
+ " }\n",
+ " )\n",
+ " + \"\\n\"\n",
+ " )"
]
},
{
@@ -256,10 +309,8 @@
},
"outputs": [],
"source": [
- "subjects_output = \"pandora-subjects.ndjson\"\n",
- "collections_output = \"pandora-collections.ndjson\"\n",
- "\n",
- "harvest_subjects()"
+ "harvester = SubjectHarvester()\n",
+ "harvester.harvest()"
]
},
{
@@ -377,13 +428,16 @@
"source": [
"# IGNORE CELL --TESTING ONLY\n",
"if os.getenv(\"GW_STATUS\") == \"dev\":\n",
- " subjects_output = \"pandora-subjects-test.ndjson\"\n",
- " collections_output = \"pandora-collections-test.ndjson\"\n",
"\n",
- " harvest_subjects(sample=1)\n",
+ " harvester = SubjectHarvester(\n",
+ " subject_output=\"pandora-subjects-test.ndjson\",\n",
+ " collection_output=\"pandora-collections-test.ndjson\",\n",
+ " sample=1,\n",
+ " )\n",
+ " harvester.harvest()\n",
"\n",
- " Path(subjects_output).unlink(missing_ok=True)\n",
- " Path(collections_output).unlink(missing_ok=True)"
+ " Path(\"pandora-subjects-test.ndjson\").unlink(missing_ok=True)\n",
+ " Path(\"pandora-collections-test.ndjson\").unlink(missing_ok=True)"
]
},
{
@@ -421,6 +475,7 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "position": 1,
"rocrate": {
"action": [
{
@@ -444,7 +499,7 @@
"orcid": "https://orcid.org/0000-0001-7956-4498"
}
],
- "description": "This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles. The datasets created can be used to assemble subject-based collections of archived websites for research.",
+ "description": "This notebook harvests Pandora's navigation hierarchy, saving the connections between subjects, collections, and titles.\n\nThe [Pandora](http://pandora.nla.gov.au/) selective web archive assigns archived titles to subject and collection groupings. These curated collections help researchers find archived websites relating to specific topics or events, such as [election campaigns](http://pandora.nla.gov.au/subject/6). This notebook creates two datasets containing details of all Pandora's subjects and collections. The datasets can be used to [assemble subject-based collections of archived websites for research](https://glam-workbench.net/trove-web-archives/create-datasets/).",
"mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/",
"name": "Harvest Pandora subjects and collections"
}
diff --git a/harvest-pandora-titles.ipynb b/harvest-pandora-titles.ipynb
index b1ffe68..505b80f 100644
--- a/harvest-pandora-titles.ipynb
+++ b/harvest-pandora-titles.ipynb
@@ -50,7 +50,7 @@
"- `gathered_url` – the url that was archived\n",
"- `surt` – the surt (Sort-friendly URI Reordering Transform) is a version of the url that reverses the order of the domain components to put the top-level domain first, making it easier to group or sort resources by domain\n",
"\n",
- "A pre-harvested version of this dataset is available from the [trove-web-archives-collections-data](https://github.com/GLAM-Workbench/trove-web-archives-collections-data) repository."
+ "A pre-harvested version of this dataset is available from the [Pandora titles data](https://glam-workbench.net/trove-web-archives/pandora-titles-data/) repository."
]
},
{
@@ -376,9 +376,10 @@
"orcid": "https://orcid.org/0000-0001-7956-4498"
}
],
- "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.",
+ "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.\n\nPandora has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.",
"mainEntityOfPage": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/",
- "name": "Harvest the full collection of Pandora titles"
+ "name": "Harvest the full collection of Pandora titles",
+ "position": 2
}
},
"nbformat": 4,
diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json
index 123d2e8..daed830 100644
--- a/ro-crate-metadata.json
+++ b/ro-crate-metadata.json
@@ -9,8 +9,8 @@
"@id": "https://orcid.org/0000-0001-7956-4498"
}
],
- "datePublished": "2024-05-03T03:41:59+00:00",
- "description": "A GLAM Workbench repository",
+ "datePublished": "2024-05-06",
+ "description": "This repository includes information on finding, understanding, and using Pandora's collections of archived web pages.\n\n[Pandora](http://pandora.nla.gov.au/) has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.\n\nThe [Web Archives](https://glam-workbench.net/web-archives/) section of the GLAM Workbench provides documentation, tools, and examples to help you work with data from a range of web archives, including the Australian Web Archive. The title urls obtained through Pandora can be used to obtain additional data from the AWA for analysis.",
"hasPart": [
{
"@id": "create-datasets.ipynb"
@@ -38,7 +38,8 @@
"@id": "https://glam-workbench.net/trove-web-archives"
},
"name": "trove-web-archives",
- "url": "https://github.com/GLAM-Workbench/trove-web-archives/"
+ "url": "https://github.com/GLAM-Workbench/trove-web-archives/",
+ "version": "v1.0.0"
},
{
"@id": "ro-crate-metadata.json",
@@ -95,7 +96,7 @@
"@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles"
},
"name": "pandora-titles.csv",
- "sdDatePublished": "2024-05-03",
+ "sdDatePublished": "2024-05-06",
"size": 87741,
"url": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv"
},
@@ -111,7 +112,7 @@
"@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections"
},
"name": "pandora-subjects.ndjson",
- "sdDatePublished": "2024-05-03",
+ "sdDatePublished": "2024-05-06",
"size": 149,
"url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-subjects.ndjson"
},
@@ -127,7 +128,7 @@
"@id": "https://github.com/GLAM-Workbench/trove-web-archives-collections"
},
"name": "pandora-collections.ndjson",
- "sdDatePublished": "2024-05-03",
+ "sdDatePublished": "2024-05-06",
"size": 1920,
"url": "https://github.com/GLAM-Workbench/trove-web-archives-collections/raw/main/pandora-collections.ndjson"
},
@@ -176,7 +177,7 @@
"conformsTo": {
"@id": "https://purl.archive.org/textcommons/profile#Notebook"
},
- "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.",
+ "description": "This notebook harvests a complete collection of archived web page titles from [Pandora](http://pandora.nla.gov.au/), the National Library of Australia's selective web archive.\n\nPandora has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.",
"encodingFormat": "application/x-ipynb+json",
"mainEntityOfPage": {
"@id": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/"
@@ -259,7 +260,7 @@
"isPartOf": {
"@id": "https://glam-workbench.net"
},
- "name": "Create datasets",
+ "name": "Harvest Pandora subjects and collections",
"url": "https://glam-workbench.net/trove-web-archives/create-datasets/"
},
{
@@ -268,7 +269,7 @@
"isPartOf": {
"@id": "https://glam-workbench.net"
},
- "name": "Harvest pandora subject collections",
+ "name": "Harvest Pandora subjects and collections",
"url": "https://glam-workbench.net/trove-web-archives/harvest-pandora-subject-collections/"
},
{
@@ -317,7 +318,7 @@
"isPartOf": {
"@id": "https://glam-workbench.net"
},
- "name": "Harvest pandora titles",
+ "name": "Harvest Pandora subjects and collections",
"url": "https://glam-workbench.net/trove-web-archives/harvest-pandora-titles/"
},
{
@@ -356,6 +357,15 @@
"@id": "https://github.com/GLAM-Workbench/trove-web-archives-titles/raw/main/pandora-titles.csv"
}
]
+ },
+ {
+ "@id": "#create_version_v1_0_0",
+ "@type": "UpdateAction",
+ "actionStatus": {
+ "@id": "http://schema.org/CompletedActionStatus"
+ },
+ "endDate": "2024-05-06",
+ "name": "Create version v1.0.0"
}
]
}
\ No newline at end of file