Merge pull request #2 from GLAM-Workbench/update
Update documentation and extract ro-crate code to separate file
wragge authored May 6, 2024
2 parents 67b166c + 718a1ec commit 7ce1a52
Showing 8 changed files with 457 additions and 288 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ Untitled*
 .python-version
 http_cache.sqlite
 data-rocrate
+subject-hierarchy.md
77 changes: 38 additions & 39 deletions .zenodo.json
@@ -1,42 +1,41 @@
 {
-  "language": "eng",
-  "license": "MIT",
-  "title": "GLAM-Workbench/trove-web-archives",
-  "related_identifiers": [
-    {
-      "scheme": "url",
-      "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v0.0.0",
-      "relation": "isDerivedFrom",
-      "resource_type": "software"
-    },
-    {
-      "scheme": "url",
-      "identifier": "https://glam-workbench.net/trove-web-archives/",
-      "relation": "isDocumentedBy",
-      "resource_type": "publication-softwaredocumentation"
-    },
-    {
-      "scheme": "url",
-      "identifier": "https://glam-workbench.net/",
-      "relation": "isPartOf",
-      "resource_type": "other"
-    }
-  ],
-  "version": "v0.0.0",
-  "upload_type": "software",
-  "keywords": [
-    "digital humanities",
-    "Jupyter",
-    "GLAM Workbench"
-  ],
-  "publication_date": "2024-03-03",
-  "creators": [
-    {
-      "name": "Sherratt, Tim",
-      "orcid": "0000-0001-7956-4498"
-    }
-  ],
-  "access_right": "open",
-  "description": ""
-}
+  "language": "eng",
+  "license": "MIT",
+  "title": "GLAM-Workbench/trove-web-archives",
+  "related_identifiers": [
+    {
+      "scheme": "url",
+      "identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v1.0.0",
+      "relation": "isDerivedFrom",
+      "resource_type": "software"
+    },
+    {
+      "scheme": "url",
+      "identifier": "https://glam-workbench.net/trove-web-archives/",
+      "relation": "isDocumentedBy",
+      "resource_type": "publication-softwaredocumentation"
+    },
+    {
+      "scheme": "url",
+      "identifier": "https://glam-workbench.net/",
+      "relation": "isPartOf",
+      "resource_type": "other"
+    }
+  ],
+  "version": "v1.0.0",
+  "upload_type": "software",
+  "keywords": [
+    "digital humanities",
+    "Jupyter",
+    "GLAM Workbench"
+  ],
+  "publication_date": "2024-05-06",
+  "creators": [
+    {
+      "name": "Sherratt, Tim",
+      "orcid": "0000-0001-7956-4498"
+    }
+  ],
+  "access_right": "open",
+  "description": "<p>A GLAM Workbench repository</p> <p>For more information and documentation see the <a href=\"https://glam-workbench.net/trove-web-archives\">Trove web archive collections (Pandora)</a> section of the <a href=\"https://glam-workbench.net\">GLAM Workbench</a>.</p> <h2 id=\"notebooks\">Notebooks</h2> <ul> <li>Create title datasets from collections and subjects</li> <li>Harvest Pandora subjects and collections</li> <li>Harvest the full collection of Pandora titles</li> </ul> <h2 id=\"associated-datasets\">Associated datasets</h2> <ul> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-collections\">trove-web-archives-collections</a></li> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-titles\">trove-web-archives-titles</a></li> </ul> <hr /> <p>Created by <a href=\"https://timsherratt.au\">Tim Sherratt</a> for the <a href=\"https://glam-workbench.net\">GLAM Workbench</a></p>"
+}
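
The version string is repeated inside this file: once in the GitHub tree URL under `related_identifiers` and once in the `version` field, and the commit bumps both (plus `publication_date`) together. A minimal consistency check, sketched here as an illustration rather than anything included in the commit:

```python
import json

# Read the updated Zenodo metadata and confirm that the release tag in the
# "isDerivedFrom" identifier matches the "version" field -- the two values
# this commit bumps from v0.0.0 to v1.0.0.
with open(".zenodo.json") as f:
    metadata = json.load(f)

version = metadata["version"]
tree_urls = [
    r["identifier"]
    for r in metadata["related_identifiers"]
    if r["relation"] == "isDerivedFrom"
]
assert all(url.endswith(f"/tree/{version}") for url in tree_urls), (
    f"related_identifiers do not point at {version}"
)
print(metadata["title"], version, metadata["publication_date"])
```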
8 changes: 7 additions & 1 deletion README.md
@@ -1,6 +1,12 @@
 # trove-web-archives
 
-A GLAM Workbench repository
+CURRENT VERSION: v1.0.0
+
+This repository includes information on finding, understanding, and using Pandora's collections of archived web pages.
+
+[Pandora](http://pandora.nla.gov.au/) has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.
+
+The [Web Archives](https://glam-workbench.net/web-archives/) section of the GLAM Workbench provides documentation, tools, and examples to help you work with data from a range of web archives, including the Australian Web Archive. The title URLs obtained through Pandora can be used to obtain additional data from the AWA for analysis.
 
 For more information and documentation see the [Trove web archive collections (Pandora)](https://glam-workbench.net/trove-web-archives) section of the [GLAM Workbench](https://glam-workbench.net).
 
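The last new README paragraph notes that title URLs harvested from Pandora can be used to pull further data out of the Australian Web Archive. As a rough illustration of that step, the sketch below asks the AWA for a Memento timemap of a single title URL; the endpoint pattern and the example URL are assumptions based on the GLAM Workbench Web Archives documentation, not part of this commit:

```python
import requests

# Assumed endpoint pattern for AWA timemaps (Memento link format); it is not
# defined anywhere in this repository.
AWA_TIMEMAP = "https://web.archive.org.au/awa/timemap/link/{url}"

def list_captures(title_url):
    """Return the memento entries from the AWA timemap for a harvested title URL."""
    response = requests.get(AWA_TIMEMAP.format(url=title_url), timeout=60)
    response.raise_for_status()
    # In link-format timemaps, each capture line carries a datetime attribute.
    return [line for line in response.text.splitlines() if 'datetime="' in line]

# Example title URL, illustrative only
captures = list_captures("http://www.discontents.com.au/")
print(f"{len(captures)} captures found")
```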
114 changes: 114 additions & 0 deletions crate_maker.py
@@ -0,0 +1,114 @@
from rocrate.rocrate import ContextEntity, ROCrate
import ipynbname
import nbformat
import mimetypes
from datetime import datetime
from giturlparse import parse as ghparse
import requests

def add_gh_file(crate, url):
datafile = url.replace("/raw/", "/blob/")
gh_parts = ghparse(datafile)

# API url to get the latest commit for this file
gh_commit_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/commits?path={gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_commit_url)

# Get the date of the last commit
date = response.json()[0]["commit"]["committer"]["date"][:10]

except (IndexError, KeyError):
date = None

# Different API endpoint for file data
gh_file_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/contents/{gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_file_url)
contents_data = response.json()
# Get the file size
try:
size = contents_data["size"]
except TypeError:
size = None

except KeyError:
size = None
obj_properties = {
"@type": [
"File",
"Dataset"
],
"contentSize": size,
"dateModified": date,
"name": gh_parts.path_raw.split('/')[-1],
"url": datafile
}
crate.add_file(datafile, properties=obj_properties)

def create_rocrate(subject, file_path, start_date, end_date):
"""
Create an RO-Crate metadata file describing the downloaded dataset.
"""
crate = ROCrate()

# Initialise crate with dataset
crate.add_file(file_path)

# Add notebook details
nb_path = ipynbname.path()
nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
nb_url = metadata.get("url", "")
nb_properties = {
"@type": ["File", "SoftwareSourceCode"],
"name": metadata.get("name", ""),
"description": metadata.get("description", ""),
"encodingFormat": "application/x-ipynb+json",
"codeRepository": metadata.get("codeRepository", ""),
"url": nb_url,
}
crate.add(ContextEntity(crate, nb_url, properties=nb_properties))

# Add action
action_id = f"{nb_path.stem}_run"
action_properties = {
"@type": "CreateAction",
"instrument": {"@id": nb_url},
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
"name": f"Run of notebook: {nb_path.name}",
"result": {"@id": f"{file_path.name}/"},
"object": [{"@id": o["url"]} for o in metadata["action"][0]["object"]],
"query": f"{subject['id']} ({subject['name']})",
"startDate": start_date,
"endDate": end_date,
}

# If there are any GitHub references in action objects, add them to the crate
for obj in metadata["action"][0]["object"]:
if "github.com" in obj["url"]:
add_gh_file(crate, obj["url"])

# Update dataset details
encoding = mimetypes.guess_type(file_path)[0]
stats = file_path.stat()
size = stats.st_size
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
rows = 0
with file_path.open("r") as df:
for line in df:
rows += 1
crate.update_jsonld(
{
"@id": file_path.name,
"dateModified": date,
"contentSize": size,
"size": rows,
"encodingFormat": encoding,
}
)
crate.add(ContextEntity(crate, action_id, properties=action_properties))

# Save the crate
crate.write(file_path.parent)
crate.write_zip(file_path.parent)
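
To show how the extracted module fits into the harvesting notebooks, here is a hedged usage sketch; the subject values and CSV path are invented for illustration, and the call only works from inside a running notebook whose metadata includes the `rocrate` section that `create_rocrate()` reads:

```python
from pathlib import Path
from crate_maker import create_rocrate

# Hypothetical values: the subject dict mirrors the "id" and "name" keys the
# function reads, and the CSV path stands in for a harvested list of titles
# that already exists on disk.
subject = {"id": "/subject/3", "name": "Business & Economy"}
titles_csv = Path("datasets/pandora-subject-3/pandora-titles.csv")

# Must be run inside a Jupyter notebook: create_rocrate() uses ipynbname to
# locate the notebook and expects a "rocrate" section (with an "action" list)
# in the notebook's metadata.
create_rocrate(subject, titles_csv, start_date="2024-05-06", end_date="2024-05-06")
```

As the final lines of crate_maker.py show, the crate metadata is written to the directory containing the dataset file, and a zipped copy of the crate is also produced.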
