Merge branch 'main' into simple-field-json
joecorall authored Nov 11, 2024
2 parents 574199d + 12a81dc commit efa9808
Showing 113 changed files with 3,114 additions and 1,234 deletions.
13 changes: 13 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,13 @@
# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
version: 2
updates:
  - package-ecosystem: github-actions
    directory: /
    groups:
      github-actions:
        patterns:
          - "*" # Group all Actions updates into a single larger pull request
    schedule:
      interval: weekly
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
@@ -25,9 +25,9 @@ jobs:
OUT_FILE_NAME: workbench.exe
ASSET_MIME: application/vnd.microsoft.portable-executable
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.8
- name: Install dependencies
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
@@ -12,7 +12,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
42 changes: 33 additions & 9 deletions Dockerfile
@@ -2,20 +2,44 @@
FROM python:3.10.6

# Build the image and name it workbench-docker-image-name (or whatever you want)
# docker build -t workbench-docker-image-name .
# docker build --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t workbench-docker .

# Build the container from the built image and run workbench:
# docker run -it --rm --network="host" -v $(pwd):/workbench --name workbench-docker-container-name workbench-docker-image-name bash -lc "(./workbench --config example.yml --check)"
# docker run -it --rm --network="host" -v .:/workbench --name update_existing_objects workbench-docker bash -lc "./workbench --config /workbench/prod/update_islandora_objects.yml --check"
# Another example, with additional directories mounted so the migration can access them.
# docker run -it --rm --network="host" -v .:/workbench -v /path/to/your/tmp:/tmp -v /path/to/your/files:/mnt/data/local --name update_existing_objects workbench-docker bash -lc "./workbench --config /workbench/prod/update_islandora_objects.yml --check"
# To export a CSV file that includes the available Drupal fields, run:
# docker run -it --rm --network="host" -v .:/workbench --name workbench-docker-container-name workbench-docker-image-name bash -lc "./workbench --config /workbench/islandora_workbench_demo_content/idc_example_geo.yml --get_csv_template"
# The directory this file is in is mounted within the container at /workbench
# Rename example.yml to your YML file.

ADD . /workbench/
# Create a non-root user and set up the environment
ARG USER_ID
ARG GROUP_ID

# Create a group with the specified GID
RUN groupadd -g $GROUP_ID dockeruser || true

# Create a user with the specified UID and GID
RUN useradd -m -u $USER_ID -g $GROUP_ID -s /bin/bash dockeruser

# Set the working directory
WORKDIR /workbench

# Works with and without this line
RUN python -m pip install setuptools
# Copy the current directory contents into the container at /workbench
COPY . /workbench/

# Set ownership and permissions for the non-root user
RUN chown -R $USER_ID:$GROUP_ID /workbench

# Set the PATH environment variable to include .local/bin
ENV PATH=/home/dockeruser/.local/bin:$PATH

# Switch to the non-root user
USER dockeruser

# RUN pip install filemagic
RUN pip install urllib3>=1.21.1
RUN pip install libmagic
RUN python setup.py install
# Install dependencies and setup the environment
RUN python -m pip install --user --upgrade pip setuptools build && \
    python -m pip install --user --no-cache-dir "urllib3>=1.21.1" libmagic && \
    python -m build && \
    python -m pip install --user dist/*.whl
2 changes: 1 addition & 1 deletion README.md
@@ -16,13 +16,13 @@ Note that this tool is not related in any way to the Drupal contrib module calle
* Allows assignment of Drupal vocabulary terms using term IDs, term names, or term URIs
* Allows creation of new taxonomy terms from CSV field data, including complex and hierarchical terms
* Allows the assignment of URL aliases
* Allows creation of URL redirects
* Allows adding alt text to images
* Supports transmission fixity auditing for media files
* Cross platform (written in Python, tested on Linux, Mac, and Windows)
* Well tested
* Well documented
* Provides both sensible default configuration values and rich configuration options for power users
* A companion project under development, [Islandora Workbench Desktop](https://github.com/mjordan/islandora_workbench_desktop), will add a graphical user interface that enables users not familiar or comfortable with the command line to use Workbench.
* Run from within a Docker container.

## Documentation
24 changes: 23 additions & 1 deletion WorkbenchConfig.py
@@ -82,6 +82,13 @@ def get_config(self):
config["temp_dir"], "csv_id_to_node_id_map.db"
)

if "page_files_source_dir_field" in user_mods:
config["page_files_source_dir_field"] = user_mods[
"page_files_source_dir_field"
]
else:
config["page_files_source_dir_field"] = config["id_field"]

config["config_file"] = self.args.config

return config
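A minimal sketch of how the new page_files_source_dir_field setting might appear in a create task configuration; per the code above, it falls back to id_field when omitted. The column name page_files_dir and the other values are hypothetical:

task: create
host: https://islandora.traefik.me
username: admin
password: password
paged_content_from_directories: true
# Hypothetical CSV column; presumably names the subdirectory holding each item's page files.
# When omitted, Workbench falls back to the id_field column.
page_files_source_dir_field: page_files_dir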
@@ -93,7 +100,11 @@ def get_user_config(self):
try:
loaded = yaml.load(stream)
except YAMLError as exc:
print(exc)
print(
f"There appears to be a YAML syntax error in your configuration file, {self.args.config}. Remove the username and\npassword, and run the file through https://codebeautify.org/yaml-validator/ or your YAML validator of choice."
)
sys.exit()

# 'media_file_fields' has been replaced with 'media_fields' and 'media_type_file_fields'.
# This is aliasing code that can be removed at some point in the future.
if "media_file_fields" in loaded:
@@ -194,6 +205,8 @@ def get_default_config(self):
"paged_content_from_directories": False,
"delete_media_with_nodes": True,
"allow_adding_terms": False,
"columns_with_term_names": [],
"protected_vocabularies": [],
"nodes_only": False,
"log_response_time": False,
"adaptive_pause_threshold": 2,
@@ -204,6 +217,7 @@
"log_response_status_code": False,
"log_headers": False,
"log_term_creation": True,
"log_file_name_and_line_number": False,
"progress_bar": False,
"user_agent": "Islandora Workbench",
"allow_redirects": True,
@@ -219,6 +233,8 @@
"field_for_remote_filename": False,
"field_for_media_title": False,
"delete_tmp_upload": False,
"input_data_zip_archives": [],
"delete_zip_archive_after_extraction": True,
"list_missing_drupal_fields": False,
"secondary_tasks": None,
"sqlite_db_filename": "workbench_temp_data.db",
@@ -264,6 +280,12 @@ def get_default_config(self):
"ignore_existing_parent_ids": True,
"query_csv_id_to_node_id_map_for_parents": False,
"ignore_duplicate_parent_ids": True,
"redirect_status_code": 301,
"csv_value_templates": [],
"csv_value_templates_for_paged_content": [],
"csv_value_templates_rand_length": 5,
"allow_csv_value_templates_if_field_empty": [],
"remind_user_to_run_check": False,
}

# Tests validity and existence of configuration file path.
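For reference, a sketch of overriding a few of the newly added defaults in a job configuration; the vocabulary ID and values are illustrative, and the comments describe assumed behaviour rather than documented guarantees:

task: create
host: https://islandora.traefik.me
username: admin
password: password
allow_adding_terms: true
# Assumed behaviour: Workbench will not add terms to vocabularies listed here.
protected_vocabularies:
  - islandora_models
# The default is 301, per the list above.
redirect_status_code: 302
remind_user_to_run_check: true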
12 changes: 9 additions & 3 deletions i7Import/get_islandora_7_content.py
@@ -22,7 +22,9 @@
parser = argparse.ArgumentParser()
parser.add_argument("--config", required=True, help="Configuration file to use.")
parser.add_argument(
"--metadata_solr_request", required=False, help="Option to solr metadata request."
"--metadata_solr_request",
required=False,
help="Option to supply solr metadata request.",
)
args = parser.parse_args()
utils = i7ImportUtilities(args.config)
@@ -44,6 +46,8 @@
metadata_solr_request = utils.get_metadata_solr_request(args.metadata_solr_request)
else:
metadata_solr_request = utils.get_default_metadata_solr_request()
if config["secure_ssl_only"] is False:
requests.packages.urllib3.disable_warnings()
if config["debug"]:
pretty_print = metadata_solr_request.replace("&", "\n&")
print(f"Solr request: {pretty_print}")
@@ -74,7 +78,7 @@
headers.append("file")
if config["id_field"] not in headers:
headers.append(config["id_field"])
index = config["id_start_number"]
index = config["id_start_number"]

if config["fetch_files"] is True:
if not os.path.exists(config["obj_directory"]):
@@ -83,12 +87,14 @@
row_count = 0
pbar = InitBar()
num_csv_rows = len(rows)
print(f"Processing {num_csv_rows -1}.")
print(f"Processing {num_csv_rows - 1}.")
with open(config["csv_output_path"], "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=headers)
writer.writeheader()
failed_pids = []
for row in reader:
if row["PID"] in config["pids_to_skip"]:
continue
rels_ext = utils.parse_rels_ext(row["PID"])
if rels_ext:
for key, value in rels_ext.items():
15 changes: 12 additions & 3 deletions i7Import/i7ImportUtilities.py
@@ -55,6 +55,7 @@ def __init__(self, config_location):
"solr_filters": False,
"start": 0,
"rows": 100000,
"secure_ssl_only": True,
}

def get_config(self):
@@ -97,7 +98,9 @@ def parse_rels_ext(self, pid):
print(f"\n{rels_ext_url}")
try:
rels_ext_download_response = requests.get(
url=rels_ext_url, allow_redirects=True
verify=self.config["secure_ssl_only"],
url=rels_ext_url,
allow_redirects=True,
)
if rels_ext_download_response.ok:
rel_ext = {}
@@ -136,7 +139,9 @@ def get_default_metadata_solr_request(self):
# then used in another query to get the populated CSV data.
try:
field_list_response = requests.get(
url=fields_solr_url, allow_redirects=True
verify=self.config["secure_ssl_only"],
url=fields_solr_url,
allow_redirects=True,
)
raw_field_list = field_list_response.content.decode()
except requests.exceptions.RequestException as e:
@@ -196,7 +201,11 @@ def get_i7_asset(self, pid, datastream):
if self.config["get_file_url"]:
obj_download_response = requests.head(url=obj_url, allow_redirects=True)
else:
obj_download_response = requests.get(url=obj_url, allow_redirects=True)
obj_download_response = requests.get(
verify=self.config["secure_ssl_only"],
url=obj_url,
allow_redirects=True,
)
if obj_download_response.status_code == 200:
# Get MIMETYPE from 'Content-Type' header
obj_mimetype = obj_download_response.headers["content-type"]
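A partial sketch of an i7 export configuration exercising the new secure_ssl_only option; the host and PID are placeholders, and pids_to_skip is shown as a list because the export script tests membership with the in operator:

host: https://islandora7.example.com
# Setting this to false passes verify=False to requests and silences the
# related urllib3 warnings; intended only for development hosts with
# self-signed certificates.
secure_ssl_only: false
pids_to_skip:
  - "islandora:1"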
41 changes: 41 additions & 0 deletions scripts/email_log_if_errors.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python3

"""Islandora Workbench shutdown script to send a the log file to someone
if it contains any ERROR entries.
"""

import smtplib
import re
import sys
from ruamel.yaml import YAML

workbench_config_file_path = sys.argv[1]

yaml = YAML()
config_yaml = open(workbench_config_file_path, "r")
config = yaml.load(config_yaml)
if "log_file_path" in config:
workbench_log_file_path = config["log_file_path"]
else:
log_file_path = "workbench.log"

fromaddr = "someaddr@example.com"
toaddrs = "anotheraddr@example.com"
# toaddrs = "anotheraddr@example.com,yetanotheraddr@example.com"

msg = (
    "Subject: Islandora Workbench log file - there were errors!\r\nFrom: %s\r\nTo: %s\r\n\r\n"
    % (fromaddr, toaddrs)
)

# If the log contains any ERROR entries, mail it.
log_file = open(workbench_log_file_path, "r")
log_file_text = log_file.read()
log_file.close()
matches = re.findall("ERROR", log_file_text)

if len(matches) > 0:
    msg = msg + log_file_text
    server = smtplib.SMTP("localhost")
    server.sendmail(fromaddr, toaddrs, msg.encode("utf-8"))
    server.quit()
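The script reads the Workbench configuration file path from its first argument. Assuming Workbench's shutdown hook passes that path to each listed script, it could be registered roughly like this (the script path and CSV file name are assumptions):

task: create
host: https://islandora.traefik.me
username: admin
password: password
input_csv: metadata.csv
# Assumed wiring: shutdown scripts run after the task finishes and receive
# the configuration file path as an argument.
shutdown:
  - "/path/to/islandora_workbench/scripts/email_log_if_errors.py"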
70 changes: 70 additions & 0 deletions scripts/generate_iiif_manifests.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

"""Iterates over all nodes that are parents and warms the cache of the
View that generates the node's IIIF Manifest at /node/xxx/book-manifest.
"""

import sys
import os
import logging
import sqlite3
import tempfile
from ruamel.yaml import YAML
import requests
from requests.exceptions import ConnectTimeout, ReadTimeout, ConnectionError

logging.basicConfig(
    filename="iiif_generation.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)

current_config_file_path = sys.argv[1]

yaml = YAML()
config_yaml = open(current_config_file_path, "r")
config = yaml.load(config_yaml)
if "csv_id_to_node_id_map_path" in config:
csv_id_to_node_id_map_path = config["csv_id_to_node_id_map_path"]
else:
csv_id_to_node_id_map_path = os.path.join(
tempfile.gettempdir(), "csv_id_to_node_id_map.db"
)

if os.path.exists(csv_id_to_node_id_map_path) is False:
    logging.error(
        f"Can't find CSV ID to node ID map database at {csv_id_to_node_id_map_path}"
    )
    sys.exit(1)

query = (
    "select config_file,csv_id,node_id from csv_id_to_node_id_map where node_id in"
    + f" (select parent_node_id from csv_id_to_node_id_map where parent_node_id != '') and config_file = '{current_config_file_path}'"
)

try:
    params = ()
    con = sqlite3.connect(csv_id_to_node_id_map_path)
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    res = cur.execute(query, params).fetchall()
    con.close()
except sqlite3.OperationalError as e:
    logging.error(f"Error executing database query: {e}")
    sys.exit(1)

for row in res:
    try:
        # row[1] is CSV ID, row[2] is node ID
        url = f"{config['host']}/node/{row[2]}/book-manifest"
        r = requests.get(url, timeout=60)
        if r.status_code == 200:
            logging.info(f"Generated IIIF Manifest {url} (CSV ID {row[1]}).")
        else:
            logging.error(
                f"Problem hitting IIIF Manifest for {url} (CSV ID {row[1]}): HTTP response code was {r.status_code}."
            )
    except Exception as e:
        logging.error(f"Problem accessing {url}: {e}")
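The warming script reads host and, optionally, csv_id_to_node_id_map_path from the same configuration file that was passed to Workbench; a minimal sketch with placeholder values:

host: https://islandora.traefik.me
# Optional; when omitted, the script looks for csv_id_to_node_id_map.db in the system temp directory.
csv_id_to_node_id_map_path: /tmp/csv_id_to_node_id_map.db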
2 changes: 2 additions & 0 deletions setup.py
@@ -12,6 +12,8 @@
"requests>=2.22,<3",
"requests_cache>=1.1",
"ruamel.yaml<=0.17.21",
"ruamel.yaml.clib<=0.2.8",
"pyparsing<3.2",
"progress_bar",
"openpyxl",
"unidecode",
@@ -2,3 +2,4 @@ task: create
host: https://islandora.traefik.me
username: admin
password: password
secure_ssl_only: false
@@ -3,3 +3,4 @@ host: https://islandora.traefik.me
username: admin
password: password
content_type: invalid_content_type
secure_ssl_only: false
@@ -3,4 +3,5 @@ host: https://islandora.traefik.me
username: admin
password: password
use_node_title_for_media: true
use_nid_in_media_title: true
use_nid_in_media_title: true
secure_ssl_only: false