generated from GLAM-Workbench/glam-workbench-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from GLAM-Workbench/update
Update documentation and extract ro-crate code to separate file
- Loading branch information
Showing
8 changed files
with
457 additions
and
288 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,3 +22,4 @@ Untitled* | |
.python-version | ||
http_cache.sqlite | ||
data-rocrate | ||
subject-hierarchy.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,41 @@ | ||
{ | ||
"language": "eng", | ||
"license": "MIT", | ||
"title": "GLAM-Workbench/trove-web-archives", | ||
"related_identifiers": [ | ||
{ | ||
"scheme": "url", | ||
"identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v0.0.0", | ||
"relation": "isDerivedFrom", | ||
"resource_type": "software" | ||
}, | ||
{ | ||
"scheme": "url", | ||
"identifier": "https://glam-workbench.net/trove-web-archives/", | ||
"relation": "isDocumentedBy", | ||
"resource_type": "publication-softwaredocumentation" | ||
}, | ||
{ | ||
"scheme": "url", | ||
"identifier": "https://glam-workbench.net/", | ||
"relation": "isPartOf", | ||
"resource_type": "other" | ||
} | ||
], | ||
"version": "v0.0.0", | ||
"upload_type": "software", | ||
"keywords": [ | ||
"digital humanities", | ||
"Jupyter", | ||
"GLAM Workbench" | ||
], | ||
"publication_date": "2024-03-03", | ||
"creators": [ | ||
"language": "eng", | ||
"license": "MIT", | ||
"title": "GLAM-Workbench/trove-web-archives", | ||
"related_identifiers": [ | ||
{ | ||
"name": "Sherratt, Tim", | ||
"orcid": "0000-0001-7956-4498" | ||
"scheme": "url", | ||
"identifier": "https://github.com/GLAM-Workbench/trove-web-archives/tree/v1.0.0", | ||
"relation": "isDerivedFrom", | ||
"resource_type": "software" | ||
}, | ||
{ | ||
"scheme": "url", | ||
"identifier": "https://glam-workbench.net/trove-web-archives/", | ||
"relation": "isDocumentedBy", | ||
"resource_type": "publication-softwaredocumentation" | ||
}, | ||
{ | ||
"scheme": "url", | ||
"identifier": "https://glam-workbench.net/", | ||
"relation": "isPartOf", | ||
"resource_type": "other" | ||
} | ||
], | ||
"version": "v1.0.0", | ||
"upload_type": "software", | ||
"keywords": [ | ||
"digital humanities", | ||
"Jupyter", | ||
"GLAM Workbench" | ||
], | ||
"publication_date": "2024-05-06", | ||
"creators": [ | ||
{ | ||
"name": "Sherratt, Tim", | ||
"orcid": "0000-0001-7956-4498" | ||
} | ||
], | ||
"access_right": "open", | ||
"description": "" | ||
} | ||
|
||
], | ||
"access_right": "open", | ||
"description": "<p>A GLAM Workbench repository</p> <p>For more information and documentation see the <a href=\"https://glam-workbench.net/trove-web-archives\">Trove web archive collections (Pandora)</a> section of the <a href=\"https://glam-workbench.net\">GLAM Workbench</a>.</p> <h2 id=\"notebooks\">Notebooks</h2> <ul> <li>Create title datasets from collections and subjects</li> <li>Harvest Pandora subjects and collections</li> <li>Harvest the full collection of Pandora titles</li> </ul> <h2 id=\"associated-datasets\">Associated datasets</h2> <ul> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-collections\">trove-web-archives-collections</a></li> <li><a href=\"https://github.com/GLAM-Workbench/trove-web-archives-titles\">trove-web-archives-titles</a></li> </ul> <hr /> <p>Created by <a href=\"https://timsherratt.au\">Tim Sherratt</a> for the <a href=\"https://glam-workbench.net\">GLAM Workbench</a></p>" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
from rocrate.rocrate import ContextEntity, ROCrate | ||
import ipynbname | ||
import nbformat | ||
import mimetypes | ||
from datetime import datetime | ||
from giturlparse import parse as ghparse | ||
import requests | ||
|
||
def add_gh_file(crate, url): | ||
datafile = url.replace("/raw/", "/blob/") | ||
gh_parts = ghparse(datafile) | ||
|
||
# API url to get the latest commit for this file | ||
gh_commit_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/commits?path={gh_parts.path_raw.split('/')[-1]}" | ||
try: | ||
response = requests.get(gh_commit_url) | ||
|
||
# Get the date of the last commit | ||
date = response.json()[0]["commit"]["committer"]["date"][:10] | ||
|
||
except (IndexError, KeyError): | ||
date = None | ||
|
||
# Different API endpoint for file data | ||
gh_file_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/contents/{gh_parts.path_raw.split('/')[-1]}" | ||
try: | ||
response = requests.get(gh_file_url) | ||
contents_data = response.json() | ||
# Get the file size | ||
try: | ||
size = contents_data["size"] | ||
except TypeError: | ||
size = None | ||
|
||
except KeyError: | ||
size = None | ||
obj_properties = { | ||
"@type": [ | ||
"File", | ||
"Dataset" | ||
], | ||
"contentSize": size, | ||
"dateModified": date, | ||
"name": gh_parts.path_raw.split('/')[-1], | ||
"url": datafile | ||
} | ||
crate.add_file(datafile, properties=obj_properties) | ||
|
||
def create_rocrate(subject, file_path, start_date, end_date): | ||
""" | ||
Create an RO-Crate metadata file describing the downloaded dataset. | ||
""" | ||
crate = ROCrate() | ||
|
||
# Initialise crate with dataset | ||
crate.add_file(file_path) | ||
|
||
# Add notebook details | ||
nb_path = ipynbname.path() | ||
nb = nbformat.read(nb_path, nbformat.NO_CONVERT) | ||
metadata = nb.metadata.rocrate | ||
nb_url = metadata.get("url", "") | ||
nb_properties = { | ||
"@type": ["File", "SoftwareSourceCode"], | ||
"name": metadata.get("name", ""), | ||
"description": metadata.get("description", ""), | ||
"encodingFormat": "application/x-ipynb+json", | ||
"codeRepository": metadata.get("codeRepository", ""), | ||
"url": nb_url, | ||
} | ||
crate.add(ContextEntity(crate, nb_url, properties=nb_properties)) | ||
|
||
# Add action | ||
action_id = f"{nb_path.stem}_run" | ||
action_properties = { | ||
"@type": "CreateAction", | ||
"instrument": {"@id": nb_url}, | ||
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"}, | ||
"name": f"Run of notebook: {nb_path.name}", | ||
"result": {"@id": f"{file_path.name}/"}, | ||
"object": [{"@id": o["url"]} for o in metadata["action"][0]["object"]], | ||
"query": f"{subject['id']} ({subject['name']})", | ||
"startDate": start_date, | ||
"endDate": end_date, | ||
} | ||
|
||
# If there are any GitHub references in action objects, add them to the crate | ||
for obj in metadata["action"][0]["object"]: | ||
if "github.com" in obj["url"]: | ||
add_gh_file(crate, obj["url"]) | ||
|
||
# Update dataset details | ||
encoding = mimetypes.guess_type(file_path)[0] | ||
stats = file_path.stat() | ||
size = stats.st_size | ||
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d") | ||
rows = 0 | ||
with file_path.open("r") as df: | ||
for line in df: | ||
rows += 1 | ||
crate.update_jsonld( | ||
{ | ||
"@id": file_path.name, | ||
"dateModified": date, | ||
"contentSize": size, | ||
"size": rows, | ||
"encodingFormat": encoding, | ||
} | ||
) | ||
crate.add(ContextEntity(crate, action_id, properties=action_properties)) | ||
|
||
# Save the crate | ||
crate.write(file_path.parent) | ||
crate.write_zip(file_path.parent) |
Oops, something went wrong.