GLAM-Workbench · wragge · Jun 6, 2024 · Jun 6, 2024 · Jun 6, 2024
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,11 @@ Newspaper.json
 data/trove-lists-2022-07-05.csv
 git_all_versions_of
 trove_tag_counts_20220706.csv
-.env
+.env
+tags_errors_test.ndjson
+tags_errors.ndjson
+data/trove-lists-2023-07-21.csv
+data/trove-lists-2024-05-29.csv
+data/trove-lists.csv
+start.txt
+trove_tag_counts_20240606.csv
diff --git a/.zenodo.json b/.zenodo.json
@@ -5,7 +5,7 @@
   "related_identifiers": [
     {
       "scheme": "url",
-      "identifier": "https://github.com/GLAM-Workbench/trove-lists/tree/v1.0.0",
+      "identifier": "https://github.com/GLAM-Workbench/trove-lists/tree/v1.1.0",
       "relation": "isDerivedFrom",
       "resource_type": "software"
     },
@@ -22,20 +22,20 @@
       "resource_type": "other"
     }
   ],
-  "version": "v1.0.0",
+  "version": "v1.1.0",
   "upload_type": "software",
   "keywords": [
     "digital humanities",
     "Jupyter",
     "GLAM Workbench"
   ],
-  "publication_date": "2023-01-25",
+  "publication_date": "2024-06-06",
   "creators": [
     {
       "name": "Sherratt, Tim",
       "orcid": "0000-0001-7956-4498"
     }
   ],
   "access_right": "open",
-  "description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-lists/releases/tag/v1.0.0\">v1.0.0</a></p> <p>Jupyter notebooks to work with data from Trove’s public lists and tags. See the <a href=\"https://glam-workbench.net/trove-lists/\">GLAM Workbench</a> for more details.</p> <h2 id=\"notebook-topics\">Notebook topics</h2> <h3 id=\"lists\">Lists</h3> <ul> <li><strong>Convert a Trove list into a CSV file</strong> – extracts list data from the Trove API and saves the results as CSV files (with separate files for newspaper articles and other resources); optionally save OCRd test, PDFs, and images of any listed newspaper articles.</li> <li><strong>Convert a Trove list into a CollectionBuilder exhibition</strong> – converts Trove lists into a series of files that can be uploaded to a <a href=\"https://github.com/CollectionBuilder/collectionbuilder-gh\">CollectionBuilder-GH</a> repository to create an instant exhibition.</li> <li><strong>Harvest summary data from Trove lists</strong> – extract and analyse data from all public lists in Trove</li> </ul> <h3 id=\"tags\">Tags</h3> <ul> <li><strong>Harvest public tags from Trove zones</strong> – assemble a dataset containing all public tags added to Trove</li> <li><strong>Analyse public tags added to Trove</strong> – explore ways of analysing and visualising the complete dataset of public tags added to Trove resources</li> </ul> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3521723\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
+  "description": "<p>Trove lists and tags are created by Trove users to organise and describe resources. The details of public lists and tags are available through the Trove API. The notebooks in this repository demonstrate how to harvest and analyse list and tag data.</p> <p>For more information and documentation see the <a href=\"https://glam-workbench.net/trove-lists\">Trove lists and tags</a> section of the <a href=\"https://glam-workbench.net\">GLAM Workbench</a>.</p> <h2 id=\"notebooks\">Notebooks</h2> <ul> <li>Convert a Trove list into a CollectionBuilder exhibition</li> <li>Harvest public tags from Trove zones</li> <li>Convert a Trove list into a CSV file</li> <li>Analyse public tags added to Trove</li> <li>Harvest summary data from Trove lists</li> </ul> <h2 id=\"associated-datasets\">Associated datasets</h2> <ul> <li><a href=\"https://github.com/GLAM-Workbench/trove-lists-metadata/\">trove-lists-metadata</a></li> <li><a href=\"https://zenodo.org/doi/10.5281/zenodo.5094313\">Trove public tags</a></li> <li><a href=\"https://zenodo.org/doi/10.5281/zenodo.7563922\">Trove tag counts</a></li> </ul> <hr /> <p>Created by <a href=\"https://timsherratt.au\">Tim Sherratt</a> for the <a href=\"https://glam-workbench.net\">GLAM Workbench</a></p>"
 }
diff --git a/Convert-a-Trove-list-into-a-CSV-file.ipynb b/Convert-a-Trove-list-into-a-CSV-file.ipynb
@@ -52,7 +52,9 @@
     "\n",
     "import pandas as pd\n",
     "import requests\n",
+    "from dotenv import load_dotenv\n",
     "from IPython.display import HTML\n",
+    "from PIL import UnidentifiedImageError\n",
     "from requests.adapters import HTTPAdapter\n",
     "from requests.exceptions import HTTPError\n",
     "from requests.packages.urllib3.util.retry import Retry\n",
@@ -62,20 +64,9 @@
     "s = requests.Session()\n",
     "retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])\n",
     "s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n",
-    "s.mount(\"https://\", HTTPAdapter(max_retries=retries))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture\n",
-    "# Load variables from the .env file if it exists\n",
-    "# Use %%capture to suppress messages\n",
-    "%load_ext dotenv\n",
-    "%dotenv"
+    "s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
+    "\n",
+    "load_dotenv()"
    ]
   },
   {
@@ -102,7 +93,9 @@
     "\n",
     "# Use api key value from environment variables if it is available\n",
     "if os.getenv(\"TROVE_API_KEY\"):\n",
-    "    API_KEY = os.getenv(\"TROVE_API_KEY\")"
+    "    API_KEY = os.getenv(\"TROVE_API_KEY\")\n",
+    "\n",
+    "headers = {\"X-API-KEY\": API_KEY}"
    ]
   },
   {
@@ -128,7 +121,7 @@
     "save_pdfs = False\n",
     "\n",
     "# Change this to False if you don't want to save images of newspaper articles\n",
-    "save_images = True"
+    "save_images = False"
    ]
   },
   {
@@ -146,23 +139,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def listify(value):\n",
-    "    \"\"\"\n",
-    "    Sometimes values can be lists and sometimes not.\n",
-    "    Turn them all into lists to make life easier.\n",
-    "    \"\"\"\n",
-    "    if isinstance(value, (str, int)):\n",
-    "        try:\n",
-    "            value = str(value)\n",
-    "        except ValueError:\n",
-    "            pass\n",
-    "        value = [value]\n",
-    "    return value\n",
-    "\n",
-    "\n",
     "def get_url(identifiers, linktype):\n",
     "    \"\"\"\n",
-    "    Loop through the identifiers to find the request url.\n",
+    "    Loop through the identifiers to find the requested url.\n",
     "    \"\"\"\n",
     "    url = \"\"\n",
     "    for identifier in identifiers:\n",
@@ -191,14 +170,14 @@
     "\n",
     "\n",
     "def get_list(list_id):\n",
-    "    list_url = f\"https://api.trove.nla.gov.au/v2/list/{list_id}?encoding=json&reclevel=full&include=listItems&key={API_KEY}\"\n",
-    "    response = s.get(list_url)\n",
+    "    list_url = f\"https://api.trove.nla.gov.au/v3/list/{list_id}?encoding=json&reclevel=full&include=listItems\"\n",
+    "    response = s.get(list_url, headers=headers)\n",
     "    return response.json()\n",
     "\n",
     "\n",
     "def get_article(id):\n",
-    "    article_api_url = f\"https://api.trove.nla.gov.au/v2/newspaper/{id}/?encoding=json&reclevel=full&include=articletext&key={API_KEY}\"\n",
-    "    response = s.get(article_api_url)\n",
+    "    article_api_url = f\"https://api.trove.nla.gov.au/v3/newspaper/{id}?encoding=json&reclevel=full&include=articletext\"\n",
+    "    response = s.get(article_api_url, headers=headers)\n",
     "    return response.json()\n",
     "\n",
     "\n",
@@ -265,15 +244,15 @@
     "    data = get_list(list_id)\n",
     "    works = []\n",
     "    articles = []\n",
-    "    for item in tqdm(data[\"list\"][0][\"listItem\"]):\n",
+    "    for item in tqdm(data[\"listItem\"]):\n",
     "        for zone, record in item.items():\n",
     "            if zone == \"work\":\n",
     "                work = {\n",
     "                    \"id\": record.get(\"id\", \"\"),\n",
     "                    \"title\": record.get(\"title\", \"\"),\n",
-    "                    \"type\": \"|\".join(listify(record.get(\"type\", \"\"))),\n",
-    "                    \"issued\": \"|\".join(listify(record.get(\"issued\", \"\"))),\n",
-    "                    \"contributor\": \"|\".join(listify(record.get(\"contributor\", \"\"))),\n",
+    "                    \"type\": \"|\".join(record.get(\"type\", [])),\n",
+    "                    \"issued\": record.get(\"issued\", \"\"),\n",
+    "                    \"contributor\": \"|\".join(record.get(\"contributor\", [])),\n",
     "                    \"trove_url\": record.get(\"troveUrl\", \"\"),\n",
     "                    \"fulltext_url\": get_url(record.get(\"identifier\", \"\"), \"fulltext\"),\n",
     "                    \"thumbnail_url\": get_url(record.get(\"identifier\", \"\"), \"thumbnail\"),\n",
@@ -286,29 +265,27 @@
     "                    \"category\": record.get(\"category\", \"\"),\n",
     "                    \"date\": record.get(\"date\", \"\"),\n",
     "                    \"newspaper_id\": record.get(\"title\", {}).get(\"id\"),\n",
-    "                    \"newspaper_title\": record.get(\"title\", {}).get(\"value\"),\n",
+    "                    \"newspaper_title\": record.get(\"title\", {}).get(\"title\"),\n",
     "                    \"page\": record.get(\"page\", \"\"),\n",
     "                    \"page_sequence\": record.get(\"pageSequence\", \"\"),\n",
     "                    \"trove_url\": f'http://nla.gov.au/nla.news-article{record.get(\"id\")}',\n",
     "                }\n",
     "                full_details = get_article(record.get(\"id\"))\n",
-    "                article[\"words\"] = full_details[\"article\"].get(\"wordCount\", \"\")\n",
-    "                article[\"illustrated\"] = full_details[\"article\"].get(\"illustrated\", \"\")\n",
-    "                article[\"corrections\"] = full_details[\"article\"].get(\n",
-    "                    \"correctionCount\", \"\"\n",
-    "                )\n",
-    "                if \"trovePageUrl\" in full_details[\"article\"]:\n",
+    "                article[\"words\"] = full_details.get(\"wordCount\", \"\")\n",
+    "                article[\"illustrated\"] = full_details.get(\"illustrated\", \"\")\n",
+    "                article[\"corrections\"] = full_details.get(\"correctionCount\", \"\")\n",
+    "                if \"trovePageUrl\" in full_details:\n",
     "                    page_id = re.search(\n",
-    "                        r\"page\\/(\\d+)\", full_details[\"article\"][\"trovePageUrl\"]\n",
+    "                        r\"page(\\d+)\", full_details[\"trovePageUrl\"]\n",
     "                    ).group(1)\n",
-    "                    article[\n",
-    "                        \"page_url\"\n",
-    "                    ] = f\"http://trove.nla.gov.au/newspaper/page/{page_id}\"\n",
+    "                    article[\"page_url\"] = (\n",
+    "                        f\"http://trove.nla.gov.au/newspaper/page/{page_id}\"\n",
+    "                    )\n",
     "                else:\n",
     "                    article[\"page_url\"] = \"\"\n",
     "                filename = make_filename(article)\n",
     "                if save_texts:\n",
-    "                    text = full_details[\"article\"].get(\"articleText\")\n",
+    "                    text = full_details.get(\"articleText\")\n",
     "                    text_file = Path(list_dir, \"text\", f\"{filename}.txt\")\n",
     "                    if text:\n",
     "                        text = re.sub(r\"<[^<]+?>\", \"\", text)\n",
@@ -325,7 +302,18 @@
     "                            for chunk in response.iter_content(chunk_size=128):\n",
     "                                pf.write(chunk)\n",
     "                if save_images:\n",
-    "                    download_images(article[\"id\"], Path(list_dir, \"image\"))\n",
+    "                    images = []\n",
+    "                    tries = 0\n",
+    "                    # Trove has had some issues loading newspaper images lately\n",
+    "                    # This is an attempted workaround\n",
+    "                    while not images and tries < 2:\n",
+    "                        try:\n",
+    "                            images = download_images(\n",
+    "                                article[\"id\"], Path(list_dir, \"image\"), masked=True\n",
+    "                            )\n",
+    "                        except UnidentifiedImageError:\n",
+    "                            time.sleep(5)\n",
+    "                            tries += 1\n",
     "\n",
     "                articles.append(article)\n",
     "    if articles:\n",
@@ -432,117 +420,21 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
-   }
+   "version": "3.10.12"
   },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {
-     "00416a59f88b4d58870633a090d29096": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HBoxModel",
-      "state": {
-       "children": [
-        "IPY_MODEL_074a80383fc549c5897b15f7e0a095cb",
-        "IPY_MODEL_38eb213b920d4d9997a6ac6472374356",
-        "IPY_MODEL_2ba81b5e07254e0d9d857b0cfcd9e2de"
-       ],
-       "layout": "IPY_MODEL_3c66a2075b454639bf8a778cf8771771"
-      }
-     },
-     "074a80383fc549c5897b15f7e0a095cb": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLModel",
-      "state": {
-       "layout": "IPY_MODEL_dd69496fe0234287980a95e5fc1ba4b4",
-       "style": "IPY_MODEL_7fe5a762946f44c3aacaea6e2a8a5af6",
-       "value": "100%"
-      }
-     },
-     "2ba81b5e07254e0d9d857b0cfcd9e2de": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLModel",
-      "state": {
-       "layout": "IPY_MODEL_8533d2f19e41418181b51338176a56ce",
-       "style": "IPY_MODEL_d8c985e7953145c382d6f6a7f75ae6e2",
-       "value": " 23/23 [00:04&lt;00:00,  3.18it/s]"
-      }
-     },
-     "38eb213b920d4d9997a6ac6472374356": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "FloatProgressModel",
-      "state": {
-       "bar_style": "success",
-       "layout": "IPY_MODEL_a4e0a04976654253ab53c86ab448f8da",
-       "max": 23,
-       "style": "IPY_MODEL_b69918428ea34736b97efc149aa1b309",
-       "value": 23
-      }
-     },
-     "3c66a2075b454639bf8a778cf8771771": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "7fe5a762946f44c3aacaea6e2a8a5af6": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLStyleModel",
-      "state": {
-       "description_width": "",
-       "font_size": null,
-       "text_color": null
-      }
-     },
-     "8533d2f19e41418181b51338176a56ce": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "a4e0a04976654253ab53c86ab448f8da": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "b69918428ea34736b97efc149aa1b309": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "ProgressStyleModel",
-      "state": {
-       "description_width": ""
-      }
-     },
-     "d8c985e7953145c382d6f6a7f75ae6e2": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLStyleModel",
-      "state": {
-       "description_width": "",
-       "font_size": null,
-       "text_color": null
-      }
-     },
-     "dd69496fe0234287980a95e5fc1ba4b4": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     }
-    },
-    "version_major": 2,
-    "version_minor": 0
-   }
+  "rocrate": {
+   "author": [
+    {
+     "mainEntityOfPage": "https://timsherratt.au",
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "category": "Lists",
+   "description": "This notebook converts Trove lists into CSV files (spreadsheets). Separate CSV files are created for newspaper articles and works from Trove's other zones. You can also save the OCRd text, a PDF, and an image of each newspaper article.",
+   "mainEntityOfPage": "https://glam-workbench.net/trove-lists/convert-a-trove-list-into-a-csv-file/",
+   "name": "Convert a Trove list into a CSV file",
+   "position": 0
   }
  },
  "nbformat": 4,