Merge pull request #2 from GLAM-Workbench/update
Update documentation and extract ro-crate code to separate file
wragge authored May 6, 2024
2 parents 67b166c + 718a1ec commit 7ce1a52
Showing 8 changed files with 457 additions and 288 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ Untitled*
 .python-version
 http_cache.sqlite
 data-rocrate
+subject-hierarchy.md
77 changes: 38 additions & 39 deletions .zenodo.json
@@ -1,42 +1,41 @@
 {
-  "language": "eng",
-  "license": "MIT",
-  "title": "GLAM-Workbench/trove-web-archives",
-  "related_identifiers": [
-    {
-      "scheme": "url",
-      "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v0.0.0",
-      "relation": "isDerivedFrom",
-      "resource_type": "software"
-    },
-    {
-      "scheme": "url",
-      "identifier": "https://glam-workbench.net/trove-web-archives/",
-      "relation": "isDocumentedBy",
-      "resource_type": "publication-softwaredocumentation"
-    },
-    {
-      "scheme": "url",
-      "identifier": "https://glam-workbench.net/",
-      "relation": "isPartOf",
-      "resource_type": "other"
-    }
-  ],
-  "version": "v0.0.0",
-  "upload_type": "software",
-  "keywords": [
-    "digital humanities",
-    "Jupyter",
-    "GLAM Workbench"
-  ],
-  "publication_date": "2024-03-03",
-  "creators": [
-    {
-      "name": "Sherratt, Tim",
-      "orcid": "0000-0001-7956-4498"
-    }
-  ],
-  "access_right": "open",
-  "description": ""
-}
+  "language": "eng",
+  "license": "MIT",
+  "title": "GLAM-Workbench/trove-web-archives",
+  "related_identifiers": [
+    {
+      "scheme": "url",
+      "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v1.0.0",
+      "relation": "isDerivedFrom",
+      "resource_type": "software"
+    },
+    {
+      "scheme": "url",
+      "identifier": "https://glam-workbench.net/trove-web-archives/",
+      "relation": "isDocumentedBy",
+      "resource_type": "publication-softwaredocumentation"
+    },
+    {
+      "scheme": "url",
+      "identifier": "https://glam-workbench.net/",
+      "relation": "isPartOf",
+      "resource_type": "other"
+    }
+  ],
+  "version": "v1.0.0",
+  "upload_type": "software",
+  "keywords": [
+    "digital humanities",
+    "Jupyter",
+    "GLAM Workbench"
+  ],
+  "publication_date": "2024-05-06",
+  "creators": [
+    {
+      "name": "Sherratt, Tim",
+      "orcid": "0000-0001-7956-4498"
+    }
+  ],
+  "access_right": "open",
+  "description": "<p>A GLAM Workbench repository</p> <p>For more information and documentation see the <a href=\"https://glam-workbench.net/trove-web-archives\">Trove web archive collections (Pandora)</a> section of the <a href=\"https://glam-workbench.net\">GLAM Workbench</a>.</p> <h2 id=\"notebooks\">Notebooks</h2> <ul> <li>Create title datasets from collections and subjects</li> <li>Harvest Pandora subjects and collections</li> <li>Harvest the full collection of Pandora titles</li> </ul> <h2 id=\"associated-datasets\">Associated datasets</h2> <ul> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-collections\">trove-web-archives-collections</a></li> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-titles\">trove-web-archives-titles</a></li> </ul> <hr /> <p>Created by <a href=\"https://timsherratt.au\">Tim Sherratt</a> for the <a href=\"https://glam-workbench.net\">GLAM Workbench</a></p>"
+}
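
The version string is repeated inside this file: once in the GitHub tree URL under `related_identifiers` and once in the `version` field, and the commit bumps both (plus `publication_date`) together. A minimal consistency check, sketched here as an illustration rather than anything included in the commit:

```python
import json

# Read the updated Zenodo metadata and confirm that the release tag in the
# "isDerivedFrom" identifier matches the "version" field -- the two values
# this commit bumps from v0.0.0 to v1.0.0.
with open(".zenodo.json") as f:
    metadata = json.load(f)

version = metadata["version"]
tree_urls = [
    r["identifier"]
    for r in metadata["related_identifiers"]
    if r["relation"] == "isDerivedFrom"
]
assert all(url.endswith(f"/tree/{version}") for url in tree_urls), (
    f"related_identifiers do not point at {version}"
)
print(metadata["title"], version, metadata["publication_date"])
```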
8 changes: 7 additions & 1 deletion README.md
@@ -1,6 +1,12 @@
 # trove-web-archives
 
-A GLAM Workbench repository
+CURRENT VERSION: v1.0.0
+
+This repository includes information on finding, understanding, and using Pandora's collections of archived web pages.
+
+[Pandora](http://pandora.nla.gov.au/) has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.
+
+The [Web Archives](https://glam-workbench.net/web-archives/) section of the GLAM Workbench provides documentation, tools, and examples to help you work with data from a range of web archives, including the Australian Web Archive. The title URLs obtained through Pandora can be used to obtain additional data from the AWA for analysis.
 
 For more information and documentation see the [Trove web archive collections (Pandora)](https://glam-workbench.net/trove-web-archives) section of the [GLAM Workbench](https://glam-workbench.net).
 
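The last new README paragraph notes that title URLs harvested from Pandora can be used to pull further data out of the Australian Web Archive. As a rough illustration of that step, the sketch below asks the AWA for a Memento timemap of a single title URL; the endpoint pattern and the example URL are assumptions based on the GLAM Workbench Web Archives documentation, not part of this commit:

```python
import requests

# Assumed endpoint pattern for AWA timemaps (Memento link format); it is not
# defined anywhere in this repository.
AWA_TIMEMAP = "https://web.archive.org.au/awa/timemap/link/{url}"

def list_captures(title_url):
    """Return the memento entries from the AWA timemap for a harvested title URL."""
    response = requests.get(AWA_TIMEMAP.format(url=title_url), timeout=60)
    response.raise_for_status()
    # In link-format timemaps, each capture line carries a datetime attribute.
    return [line for line in response.text.splitlines() if 'datetime="' in line]

# Example title URL, illustrative only
captures = list_captures("http://www.discontents.com.au/")
print(f"{len(captures)} captures found")
```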
114 changes: 114 additions & 0 deletions crate_maker.py
@@ -0,0 +1,114 @@
from rocrate.rocrate import ContextEntity, ROCrate
import ipynbname
import nbformat
import mimetypes
from datetime import datetime
from giturlparse import parse as ghparse
import requests

def add_gh_file(crate, url):
datafile = url.replace("/raw/", "/blob/")
gh_parts = ghparse(datafile)

# API url to get the latest commit for this file
gh_commit_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/commits?path={gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_commit_url)

# Get the date of the last commit
date = response.json()[0]["commit"]["committer"]["date"][:10]

except (IndexError, KeyError):
date = None

# Different API endpoint for file data
gh_file_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/contents/{gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_file_url)
contents_data = response.json()
# Get the file size
try:
size = contents_data["size"]
except TypeError:
size = None

except KeyError:
size = None
obj_properties = {
"@type": [
"File",
"Dataset"
],
"contentSize": size,
"dateModified": date,
"name": gh_parts.path_raw.split('/')[-1],
"url": datafile
}
crate.add_file(datafile, properties=obj_properties)

def create_rocrate(subject, file_path, start_date, end_date):
"""
Create an RO-Crate metadata file describing the downloaded dataset.
"""
crate = ROCrate()

# Initialise crate with dataset
crate.add_file(file_path)

# Add notebook details
nb_path = ipynbname.path()
nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
nb_url = metadata.get("url", "")
nb_properties = {
"@type": ["File", "SoftwareSourceCode"],
"name": metadata.get("name", ""),
"description": metadata.get("description", ""),
"encodingFormat": "application/x-ipynb+json",
"codeRepository": metadata.get("codeRepository", ""),
"url": nb_url,
}
crate.add(ContextEntity(crate, nb_url, properties=nb_properties))

# Add action
action_id = f"{nb_path.stem}_run"
action_properties = {
"@type": "CreateAction",
"instrument": {"@id": nb_url},
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
"name": f"Run of notebook: {nb_path.name}",
"result": {"@id": f"{file_path.name}/"},
"object": [{"@id": o["url"]} for o in metadata["action"][0]["object"]],
"query": f"{subject['id']} ({subject['name']})",
"startDate": start_date,
"endDate": end_date,
}

# If there are any GitHub references in action objects, add them to the crate
for obj in metadata["action"][0]["object"]:
if "github.com" in obj["url"]:
add_gh_file(crate, obj["url"])

# Update dataset details
encoding = mimetypes.guess_type(file_path)[0]
stats = file_path.stat()
size = stats.st_size
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
rows = 0
with file_path.open("r") as df:
for line in df:
rows += 1
crate.update_jsonld(
{
"@id": file_path.name,
"dateModified": date,
"contentSize": size,
"size": rows,
"encodingFormat": encoding,
}
)
crate.add(ContextEntity(crate, action_id, properties=action_properties))

# Save the crate
crate.write(file_path.parent)
crate.write_zip(file_path.parent)
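
To show how the extracted module fits into the harvesting notebooks, here is a hedged usage sketch; the subject values and CSV path are invented for illustration, and the call only works from inside a running notebook whose metadata includes the `rocrate` section that `create_rocrate()` reads:

```python
from pathlib import Path
from crate_maker import create_rocrate

# Hypothetical values: the subject dict mirrors the "id" and "name" keys the
# function reads, and the CSV path stands in for a harvested list of titles
# that already exists on disk.
subject = {"id": "/subject/3", "name": "Business & Economy"}
titles_csv = Path("datasets/pandora-subject-3/pandora-titles.csv")

# Must be run inside a Jupyter notebook: create_rocrate() uses ipynbname to
# locate the notebook and expects a "rocrate" section (with an "action" list)
# in the notebook's metadata.
create_rocrate(subject, titles_csv, start_date="2024-05-06", end_date="2024-05-06")
```

As the final lines of crate_maker.py show, the crate metadata is written to the directory containing the dataset file, and a zipped copy of the crate is also produced.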
