Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding annotations from djvu file #73

Merged
merged 7 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion iiify/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from flask_caching import Cache
from iiif2 import iiif, web
from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
purify_domain, cantaloupe_resolver, create_collection3, IsCollection
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
from .configs import options, cors, approot, cache_root, media_root, \
cache_expr, version, image_server, cache_timeouts
from urllib.parse import quote
Expand Down Expand Up @@ -191,6 +191,11 @@ def manifest3(identifier):
raise excpt
# abort(404)

@app.route('/iiif/<version>/annotations/<identifier>/<fileName>/<canvas_no>.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def annnotations(version, identifier, fileName, canvas_no):
    """Serve the annotation page for a single canvas of an item.

    The URL segments are forwarded unchanged to ``create_annotations``.
    The base domain comes from the optional ``domain`` query argument and
    falls back to the request's URL root; either value is passed through
    ``purify_domain`` before use.

    Args:
        version: API version segment of the URL.
        identifier: Item identifier segment of the URL.
        fileName: Name of the file the annotations are derived from.
        canvas_no: Canvas number segment of the URL (before ``.json``).

    Returns:
        Whatever ``ldjsonify`` produces for the generated annotation page.
    """
    requested_domain = request.args.get('domain', request.url_root)
    annotation_page = create_annotations(
        version,
        identifier,
        fileName,
        canvas_no,
        domain=purify_domain(requested_domain),
    )
    return ldjsonify(annotation_page)

@app.route('/iiif/<identifier>/manifest.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
Expand Down
69 changes: 68 additions & 1 deletion iiify/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import requests
from iiif2 import iiif, web
from .configs import options, cors, approot, cache_root, media_root, apiurl, LINKS
from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef
from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage,AnnotationPageRef, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef

from urllib.parse import urlparse, parse_qs, quote
import json
import math
import re
import xml.etree.ElementTree as ET

IMG_CTX = 'http://iiif.io/api/image/2/context.json'
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
Expand Down Expand Up @@ -468,9 +470,12 @@ def create_manifest3(identifier, domain=None, page=None):
# subprefix can be different from the identifier use the scandata filename to find the correct prefix
# if not present fall back to identifier
subprefix = identifier
djvuFile = ""
for fileMd in metadata['files']:
if fileMd['name'].endswith('_scandata.xml'):
subprefix = fileMd['name'].replace('_scandata.xml', '')
if fileMd['format'] == 'Djvu XML':
djvuFile = fileMd['name']

bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}"

Expand Down Expand Up @@ -531,7 +536,20 @@ def create_manifest3(identifier, domain=None, page=None):
except:
pass

# Add annotations if djvu file is present
if djvuFile:
count = 1
for canvas in manifest.items:
if 'annotations' in canvas:
annotations = canvas.annotations
else:
annotations = []

annotations.append(
AnnotationPageRef(id=f"{domain}3/annotations/{identifier}/{quote(djvuFile, safe='()')}/{count}.json", type="AnnotationPage")
)
canvas.annotations = annotations
count += 1
elif mediatype == 'image':
(multiFile, format) = checkMultiItem(metadata)
print (f"Checking multiFile {multiFile} {format}")
Expand Down Expand Up @@ -710,6 +728,55 @@ def create_manifest3(identifier, domain=None, page=None):

return json.loads(manifest.jsonld())

def create_annotations(version, identifier, fileName, canvas_no, domain=None):
    """Build an AnnotationPage with one annotation per WORD of a DjVu XML page.

    The XML file is downloaded from ``{ARCHIVE}/download/{identifier}/{fileName}``.
    The ``OBJECT`` element selected by ``canvas_no`` is located and every
    ``WORD`` element beneath it becomes a plain-text annotation whose target
    is an ``xywh`` region derived from the word's ``coords`` attribute.

    Args:
        version: API version segment used when building the page id.
        identifier: Item identifier.
        fileName: Name of the XML file within the item.
        canvas_no: 1-based page number (string or int); must parse as an int.
        domain: Base URL prefix used when building the page id.

    Returns:
        The annotation page as a plain dict (parsed from its JSON-LD form).

    Raises:
        ValueError: If ``canvas_no`` is not an integer, if the file cannot be
            downloaded, or if its content is not well-formed XML.
        IndexError: If no ``OBJECT`` element matches ``canvas_no``.
    """
    page_id = f"{domain}{version}/annotations/{identifier}/{quote(fileName, safe='()')}/{canvas_no}.json"
    annotationPage = AnnotationPage(id=page_id)
    annotationPage.items = []
    # Annotation ids and targets use a 0-based index, the URL a 1-based number.
    index = int(canvas_no) - 1
    url = f"{ARCHIVE}/download/{identifier}/{fileName}"

    # Only fetching and parsing can raise the exceptions handled here, so the
    # try body is limited to those steps.
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        djfu = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML file: {e}")
        raise ValueError(f"Failed to retrieve {url}") from e
    except ET.ParseError as e:
        print(f"Error parsing the XML content: {e}")
        raise ValueError(f"Failed to process {url}") from e

    page = djfu.findall(f".//OBJECT[{canvas_no}]")[0]
    for count, word in enumerate(page.findall(".//WORD"), start=1):
        # <WORD coords="444,1353,635,1294" x-confidence="10">[David </WORD>
        # coords are "left_x,bottom_y,right_x,top_y", so:
        #   x = left_x, y = top_y, w = right_x - left_x, h = bottom_y - top_y
        (left_x, bottom_y, right_x, top_y) = word.attrib['coords'].split(',')
        width = int(right_x) - int(left_x)
        height = int(bottom_y) - int(top_y)
        annotationPage.items.append({
            "id": f"https://iiif.archive.org/iiif/{identifier}/canvas/{index}/anno/{count}",
            "type": "Annotation",
            # NOTE(review): tests/test_annotations.py asserts "supplementing"
            # for this field - confirm which motivation is intended.
            "motivation": "commenting",
            "body": {
                "type": "TextualBody",
                "format": "text/plain",
                "value": word.text
            },
            "target": f"https://iiif.archive.org/iiif/{identifier}${index}/canvas#xywh={left_x},{top_y},{width},{height}"
        })

    return json.loads(annotationPage.jsonld())

def coerce_list(value):
if isinstance(value, list):
return ". ".join(value)
Expand Down
57 changes: 57 additions & 0 deletions tests/test_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import unittest
from flask.testing import FlaskClient
from iiify.app import app

class TestAnnotations(unittest.TestCase):
    """Checks the v3 manifest's annotation references and the annotation endpoint."""

    def setUp(self) -> None:
        self.test_app = FlaskClient(app)

    def test_v3_manifest_has_annotations(self):
        """Every canvas must reference its annotation page without embedding it."""
        resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        manifest = resp.json

        # Annotation page URLs are numbered from 1 in canvas order.
        for count, canvas in enumerate(manifest['items'], start=1):
            self.assertIn('annotations', canvas, f"Expected annotations in canvas {canvas['id']}")
            annotations_url = f"https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/{count}.json"
            found = False
            for anno in canvas['annotations']:
                if anno['id'] == annotations_url:
                    found = True
                    self.assertNotIn('items', anno, "As a referenced AnnotationPage it shouldn't contain items.")
                    self.assertTrue('type' in anno and anno['type'] == "AnnotationPage", f"Expected annotation page to have a type {anno}")

            self.assertTrue(found, f"Expected to find {annotations_url} in {canvas['annotations']}")

    def test_v3_annotations(self):
        """The annotation page for canvas 1 has the expected shape and content."""
        resp = self.test_app.get("/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        annotations = resp.json

        self.assertEqual(annotations['id'], "https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json", "Unexpected id")
        self.assertEqual(annotations['@context'], "http://iiif.io/api/presentation/3/context.json", "Unexpected context")
        self.assertEqual(annotations['type'], "AnnotationPage", "Unexpected type, expected AnnotationPage")
        annotationList = annotations['items']
        self.assertEqual(len(annotationList), 6, "Unexpected number of annotations")

        ids = []
        for position, anno in enumerate(annotationList):
            self.assertNotIn(anno['id'], ids, f"Duplicate ID: {anno['id']}")
            ids.append(anno['id'])
            self.assertEqual(anno['type'], "Annotation", "Expected type of Annotation")
            self.assertTrue("body" in anno and "target" in anno, f"Body or target missing from annotation {anno}")
            self.assertEqual(anno['body']['type'], "TextualBody", "Expected body to be a TextualBody")
            self.assertEqual(anno['body']['format'], "text/plain", "Expected format to be a text/plain")
            self.assertEqual(anno['target'].split('#')[0], "https://iiif.archive.org/iiif/journalofexpedit00ford$0/canvas")
            # Only the first annotation's exact region and text are pinned.
            if position == 0:
                self.assertEqual(anno['target'].split('#')[1], "xywh=592,1742,460,118")
                self.assertEqual(anno['body']['value'], "JOURNAL ")

            self.assertEqual(anno['motivation'], "supplementing", "Expected motivation of supplementing")


Loading