Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,11 @@ Newspaper.json
data/trove-lists-2022-07-05.csv
git_all_versions_of
trove_tag_counts_20220706.csv
.env
.env
tags_errors_test.ndjson
tags_errors.ndjson
data/trove-lists-2023-07-21.csv
data/trove-lists-2024-05-29.csv
data/trove-lists.csv
start.txt
trove_tag_counts_20240606.csv
8 changes: 4 additions & 4 deletions .zenodo.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"related_identifiers": [
{
"scheme": "url",
"identifier": "https://github.com/GLAM-Workbench/trove-lists/tree/v1.0.0",
"identifier": "https://github.com/GLAM-Workbench/trove-lists/tree/v1.1.0",
"relation": "isDerivedFrom",
"resource_type": "software"
},
Expand All @@ -22,20 +22,20 @@
"resource_type": "other"
}
],
"version": "v1.0.0",
"version": "v1.1.0",
"upload_type": "software",
"keywords": [
"digital humanities",
"Jupyter",
"GLAM Workbench"
],
"publication_date": "2023-01-25",
"publication_date": "2024-06-06",
"creators": [
{
"name": "Sherratt, Tim",
"orcid": "0000-0001-7956-4498"
}
],
"access_right": "open",
"description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-lists/releases/tag/v1.0.0\">v1.0.0</a></p> <p>Jupyter notebooks to work with data from Trove’s public lists and tags. See the <a href=\"https://glam-workbench.net/trove-lists/\">GLAM Workbench</a> for more details.</p> <h2 id=\"notebook-topics\">Notebook topics</h2> <h3 id=\"lists\">Lists</h3> <ul> <li><strong>Convert a Trove list into a CSV file</strong> – extracts list data from the Trove API and saves the results as CSV files (with separate files for newspaper articles and other resources); optionally save OCRd test, PDFs, and images of any listed newspaper articles.</li> <li><strong>Convert a Trove list into a CollectionBuilder exhibition</strong> – converts Trove lists into a series of files that can be uploaded to a <a href=\"https://github.com/CollectionBuilder/collectionbuilder-gh\">CollectionBuilder-GH</a> repository to create an instant exhibition.</li> <li><strong>Harvest summary data from Trove lists</strong> – extract and analyse data from all public lists in Trove</li> </ul> <h3 id=\"tags\">Tags</h3> <ul> <li><strong>Harvest public tags from Trove zones</strong> – assemble a dataset containing all public tags added to Trove</li> <li><strong>Analyse public tags added to Trove</strong> – explore ways of analysing and visualising the complete dataset of public tags added to Trove resources</li> </ul> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3521723\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
"description": "<p>Trove lists and tags are created by Trove users to organise and describe resources. The details of public lists and tags are available through the Trove API. The notebooks in this repository demonstrate how to harvest and analyse list and tag data.</p> <p>For more information and documentation see the <a href=\"https://glam-workbench.net/trove-lists\">Trove lists and tags</a> section of the <a href=\"https://glam-workbench.net\">GLAM Workbench</a>.</p> <h2 id=\"notebooks\">Notebooks</h2> <ul> <li>Convert a Trove list into a CollectionBuilder exhibition</li> <li>Harvest public tags from Trove zones</li> <li>Convert a Trove list into a CSV file</li> <li>Analyse public tags added to Trove</li> <li>Harvest summary data from Trove lists</li> </ul> <h2 id=\"associated-datasets\">Associated datasets</h2> <ul> <li><a href=\"https://github.com/GLAM-Workbench/trove-lists-metadata/\">trove-lists-metadata</a></li> <li><a href=\"https://zenodo.org/doi/10.5281/zenodo.5094313\">Trove public tags</a></li> <li><a href=\"https://zenodo.org/doi/10.5281/zenodo.7563922\">Trove tag counts</a></li> </ul> <hr /> <p>Created by <a href=\"https://timsherratt.au\">Tim Sherratt</a> for the <a href=\"https://glam-workbench.net\">GLAM Workbench</a></p>"
}
216 changes: 54 additions & 162 deletions Convert-a-Trove-list-into-a-CSV-file.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@
"\n",
"import pandas as pd\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from IPython.display import HTML\n",
"from PIL import UnidentifiedImageError\n",
"from requests.adapters import HTTPAdapter\n",
"from requests.exceptions import HTTPError\n",
"from requests.packages.urllib3.util.retry import Retry\n",
Expand All @@ -62,20 +64,9 @@
"s = requests.Session()\n",
"retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])\n",
"s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n",
"s.mount(\"https://\", HTTPAdapter(max_retries=retries))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Load variables from the .env file if it exists\n",
"# Use %%capture to suppress messages\n",
"%load_ext dotenv\n",
"%dotenv"
"s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
"\n",
"load_dotenv()"
]
},
{
Expand All @@ -102,7 +93,9 @@
"\n",
"# Use api key value from environment variables if it is available\n",
"if os.getenv(\"TROVE_API_KEY\"):\n",
" API_KEY = os.getenv(\"TROVE_API_KEY\")"
" API_KEY = os.getenv(\"TROVE_API_KEY\")\n",
"\n",
"headers = {\"X-API-KEY\": API_KEY}"
]
},
{
Expand All @@ -128,7 +121,7 @@
"save_pdfs = False\n",
"\n",
"# Change this to False if you don't want to save images of newspaper articles\n",
"save_images = True"
"save_images = False"
]
},
{
Expand All @@ -146,23 +139,9 @@
"metadata": {},
"outputs": [],
"source": [
"def listify(value):\n",
" \"\"\"\n",
" Sometimes values can be lists and sometimes not.\n",
" Turn them all into lists to make life easier.\n",
" \"\"\"\n",
" if isinstance(value, (str, int)):\n",
" try:\n",
" value = str(value)\n",
" except ValueError:\n",
" pass\n",
" value = [value]\n",
" return value\n",
"\n",
"\n",
"def get_url(identifiers, linktype):\n",
" \"\"\"\n",
" Loop through the identifiers to find the request url.\n",
" Loop through the identifiers to find the requested url.\n",
" \"\"\"\n",
" url = \"\"\n",
" for identifier in identifiers:\n",
Expand Down Expand Up @@ -191,14 +170,14 @@
"\n",
"\n",
"def get_list(list_id):\n",
" list_url = f\"https://api.trove.nla.gov.au/v2/list/{list_id}?encoding=json&reclevel=full&include=listItems&key={API_KEY}\"\n",
" response = s.get(list_url)\n",
" list_url = f\"https://api.trove.nla.gov.au/v3/list/{list_id}?encoding=json&reclevel=full&include=listItems\"\n",
" response = s.get(list_url, headers=headers)\n",
" return response.json()\n",
"\n",
"\n",
"def get_article(id):\n",
" article_api_url = f\"https://api.trove.nla.gov.au/v2/newspaper/{id}/?encoding=json&reclevel=full&include=articletext&key={API_KEY}\"\n",
" response = s.get(article_api_url)\n",
" article_api_url = f\"https://api.trove.nla.gov.au/v3/newspaper/{id}?encoding=json&reclevel=full&include=articletext\"\n",
" response = s.get(article_api_url, headers=headers)\n",
" return response.json()\n",
"\n",
"\n",
Expand Down Expand Up @@ -265,15 +244,15 @@
" data = get_list(list_id)\n",
" works = []\n",
" articles = []\n",
" for item in tqdm(data[\"list\"][0][\"listItem\"]):\n",
" for item in tqdm(data[\"listItem\"]):\n",
" for zone, record in item.items():\n",
" if zone == \"work\":\n",
" work = {\n",
" \"id\": record.get(\"id\", \"\"),\n",
" \"title\": record.get(\"title\", \"\"),\n",
" \"type\": \"|\".join(listify(record.get(\"type\", \"\"))),\n",
" \"issued\": \"|\".join(listify(record.get(\"issued\", \"\"))),\n",
" \"contributor\": \"|\".join(listify(record.get(\"contributor\", \"\"))),\n",
" \"type\": \"|\".join(record.get(\"type\", [])),\n",
" \"issued\": record.get(\"issued\", \"\"),\n",
" \"contributor\": \"|\".join(record.get(\"contributor\", [])),\n",
" \"trove_url\": record.get(\"troveUrl\", \"\"),\n",
" \"fulltext_url\": get_url(record.get(\"identifier\", \"\"), \"fulltext\"),\n",
" \"thumbnail_url\": get_url(record.get(\"identifier\", \"\"), \"thumbnail\"),\n",
Expand All @@ -286,29 +265,27 @@
" \"category\": record.get(\"category\", \"\"),\n",
" \"date\": record.get(\"date\", \"\"),\n",
" \"newspaper_id\": record.get(\"title\", {}).get(\"id\"),\n",
" \"newspaper_title\": record.get(\"title\", {}).get(\"value\"),\n",
" \"newspaper_title\": record.get(\"title\", {}).get(\"title\"),\n",
" \"page\": record.get(\"page\", \"\"),\n",
" \"page_sequence\": record.get(\"pageSequence\", \"\"),\n",
" \"trove_url\": f'http://nla.gov.au/nla.news-article{record.get(\"id\")}',\n",
" }\n",
" full_details = get_article(record.get(\"id\"))\n",
" article[\"words\"] = full_details[\"article\"].get(\"wordCount\", \"\")\n",
" article[\"illustrated\"] = full_details[\"article\"].get(\"illustrated\", \"\")\n",
" article[\"corrections\"] = full_details[\"article\"].get(\n",
" \"correctionCount\", \"\"\n",
" )\n",
" if \"trovePageUrl\" in full_details[\"article\"]:\n",
" article[\"words\"] = full_details.get(\"wordCount\", \"\")\n",
" article[\"illustrated\"] = full_details.get(\"illustrated\", \"\")\n",
" article[\"corrections\"] = full_details.get(\"correctionCount\", \"\")\n",
" if \"trovePageUrl\" in full_details:\n",
" page_id = re.search(\n",
" r\"page\\/(\\d+)\", full_details[\"article\"][\"trovePageUrl\"]\n",
" r\"page(\\d+)\", full_details[\"trovePageUrl\"]\n",
" ).group(1)\n",
" article[\n",
" \"page_url\"\n",
" ] = f\"http://trove.nla.gov.au/newspaper/page/{page_id}\"\n",
" article[\"page_url\"] = (\n",
" f\"http://trove.nla.gov.au/newspaper/page/{page_id}\"\n",
" )\n",
" else:\n",
" article[\"page_url\"] = \"\"\n",
" filename = make_filename(article)\n",
" if save_texts:\n",
" text = full_details[\"article\"].get(\"articleText\")\n",
" text = full_details.get(\"articleText\")\n",
" text_file = Path(list_dir, \"text\", f\"{filename}.txt\")\n",
" if text:\n",
" text = re.sub(r\"<[^<]+?>\", \"\", text)\n",
Expand All @@ -325,7 +302,18 @@
" for chunk in response.iter_content(chunk_size=128):\n",
" pf.write(chunk)\n",
" if save_images:\n",
" download_images(article[\"id\"], Path(list_dir, \"image\"))\n",
" images = []\n",
" tries = 0\n",
" # Trove has had some issues loading newspaper images lately\n",
" # This is an attempted workaround\n",
" while not images and tries < 2:\n",
" try:\n",
" images = download_images(\n",
" article[\"id\"], Path(list_dir, \"image\"), masked=True\n",
" )\n",
" except UnidentifiedImageError:\n",
" time.sleep(5)\n",
" tries += 1\n",
"\n",
" articles.append(article)\n",
" if articles:\n",
Expand Down Expand Up @@ -432,117 +420,21 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
"version": "3.10.12"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {
"00416a59f88b4d58870633a090d29096": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "HBoxModel",
"state": {
"children": [
"IPY_MODEL_074a80383fc549c5897b15f7e0a095cb",
"IPY_MODEL_38eb213b920d4d9997a6ac6472374356",
"IPY_MODEL_2ba81b5e07254e0d9d857b0cfcd9e2de"
],
"layout": "IPY_MODEL_3c66a2075b454639bf8a778cf8771771"
}
},
"074a80383fc549c5897b15f7e0a095cb": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "HTMLModel",
"state": {
"layout": "IPY_MODEL_dd69496fe0234287980a95e5fc1ba4b4",
"style": "IPY_MODEL_7fe5a762946f44c3aacaea6e2a8a5af6",
"value": "100%"
}
},
"2ba81b5e07254e0d9d857b0cfcd9e2de": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "HTMLModel",
"state": {
"layout": "IPY_MODEL_8533d2f19e41418181b51338176a56ce",
"style": "IPY_MODEL_d8c985e7953145c382d6f6a7f75ae6e2",
"value": " 23/23 [00:04&lt;00:00, 3.18it/s]"
}
},
"38eb213b920d4d9997a6ac6472374356": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "FloatProgressModel",
"state": {
"bar_style": "success",
"layout": "IPY_MODEL_a4e0a04976654253ab53c86ab448f8da",
"max": 23,
"style": "IPY_MODEL_b69918428ea34736b97efc149aa1b309",
"value": 23
}
},
"3c66a2075b454639bf8a778cf8771771": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "2.0.0",
"model_name": "LayoutModel",
"state": {}
},
"7fe5a762946f44c3aacaea6e2a8a5af6": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "HTMLStyleModel",
"state": {
"description_width": "",
"font_size": null,
"text_color": null
}
},
"8533d2f19e41418181b51338176a56ce": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "2.0.0",
"model_name": "LayoutModel",
"state": {}
},
"a4e0a04976654253ab53c86ab448f8da": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "2.0.0",
"model_name": "LayoutModel",
"state": {}
},
"b69918428ea34736b97efc149aa1b309": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "ProgressStyleModel",
"state": {
"description_width": ""
}
},
"d8c985e7953145c382d6f6a7f75ae6e2": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "2.0.0",
"model_name": "HTMLStyleModel",
"state": {
"description_width": "",
"font_size": null,
"text_color": null
}
},
"dd69496fe0234287980a95e5fc1ba4b4": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "2.0.0",
"model_name": "LayoutModel",
"state": {}
}
},
"version_major": 2,
"version_minor": 0
}
"rocrate": {
"author": [
{
"mainEntityOfPage": "https://timsherratt.au",
"name": "Sherratt, Tim",
"orcid": "https://orcid.org/0000-0001-7956-4498"
}
],
"category": "Lists",
"description": "This notebook converts Trove lists into CSV files (spreadsheets). Separate CSV files are created for newspaper articles and works from Trove's other zones. You can also save the OCRd text, a PDF, and an image of each newspaper article.",
"mainEntityOfPage": "https://glam-workbench.net/trove-lists/convert-a-trove-list-into-a-csv-file/",
"name": "Convert a Trove list into a CSV file",
"position": 0
}
},
"nbformat": 4,
Expand Down
Loading