Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow running with incomplete descriptions #58

Merged
merged 35 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
61c35bd
fix MODS name without roles, ht@kba #51
bertsky Dec 3, 2021
499c3cc
fallback to empty publicationStmt/date and encodingDesc if metsHdr is…
bertsky Dec 3, 2021
8984b1b
get_text_in_line: append HYP content if available
bertsky Dec 3, 2021
7b136c8
log to stderr instead of stdout (to prevent mixing with TEI)
bertsky Dec 3, 2021
6545b16
improve makefile
bertsky Dec 3, 2021
711025a
improve CI
bertsky Dec 3, 2021
605dd89
mets.fromfile: allow missing logical structmap
bertsky Dec 5, 2021
3bfa7c2
mets.fromfile: allow missing mods originInfo
bertsky Dec 5, 2021
559e4c1
mets.fromfile: allow missing mods physicalDescription
bertsky Dec 5, 2021
1a7fe59
mets.fromfile: allow missing mets amdSec provenance dv
bertsky Dec 5, 2021
af1740e
mets.fromfile: simplify physical struct map, allow missing @ORDER
bertsky Dec 5, 2021
18a2dde
mets.fromfile: allow missing struct link
bertsky Dec 5, 2021
dbcc1fe
teil.fill_from_mets: allow empty logical struct map and struct link
bertsky Dec 5, 2021
61c4624
METS to TEI structure: comment urging for more+better mappings
bertsky Dec 5, 2021
15022f5
rename changelog
bertsky Dec 6, 2021
553e0fd
improve+update changelog
bertsky Dec 6, 2021
27dffe8
differentiate image number and page number
bertsky Dec 6, 2021
c39b6c7
allow passing image fileGrp other than DEFAULT
bertsky Dec 6, 2021
71fd269
add params for image fileGrp and output file, more logging
bertsky Dec 6, 2021
5c20f90
update changelog
bertsky Dec 6, 2021
ad261ff
generalize passing URN and VD ID to all identifiers
bertsky Dec 12, 2021
93fb684
improve level, title and idno metadata…
bertsky Dec 13, 2021
9a5f486
fall back to biblFull title level u
bertsky Dec 13, 2021
55353e5
keep going if there is no author and div type
bertsky Dec 14, 2021
0bf8bd3
fix tei:collection
bertsky Dec 20, 2021
7962b8c
fix tei:repository (from list-valued mods:physicalLocation), add tei:…
bertsky Dec 20, 2021
073f2b1
fix 7962b8c5
bertsky Dec 20, 2021
8d2fc41
add tei:notesStmt/tei:note from mods:note
bertsky Dec 20, 2021
06f1ccf
fix tei:editionStmt (does not belong under titleStmt)
bertsky Dec 20, 2021
c49c2a4
add tei:keywords | tei:classCode under tei:textClass (for mods:subjec…
bertsky Dec 20, 2021
27127fe
chdir to METS dir if not URL
bertsky Dec 20, 2021
8ac0747
fix mods:location (only once, but multiple contents)
bertsky Dec 20, 2021
20546af
fix regression in 27127febd
bertsky Dec 20, 2021
f33a4ca
drop Python 3.5
bertsky Jan 6, 2022
8204bfc
Revert regression fix in README.md
wrznr Jan 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions mets_mods2tei/api/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ def __init__(self):
self.license_url = None
self.encoding_date = None
self.encoding_desc = None
self.owner_manuscript = None
self.location_phys = None
self.location_urls = None
self.shelf_locators = None
self.identifiers = None
wrznr marked this conversation as resolved.
Show resolved Hide resolved
self.scripts = None
Expand Down Expand Up @@ -339,7 +340,9 @@ def norm_title_first(titleInfo):
if location.get_shelfLocator():
self.shelf_locators.extend([shelf_locator.get_valueOf_() for shelf_locator in location.get_shelfLocator()])
elif location.get_physicalLocation():
self.owner_manuscript = location.get_physicalLocation()
self.location_phys = location.get_physicalLocation()[0]
elif location.get_url():
self.location_urls = location.get_url()

#
# URN and VD ID
Expand Down Expand Up @@ -513,11 +516,17 @@ def get_encoding_description(self):
"""
return self.encoding_desc

def get_owner_manuscript(self):
def get_location_phys(self):
"""
Return the owner of the original manuscript
Return the physical location of the original manuscript
wrznr marked this conversation as resolved.
Show resolved Hide resolved
"""
return self.owner_manuscript
return self.location_phys

def get_location_urls(self):
"""
Return the URL location of the original manuscript
"""
return self.location_urls

def get_shelf_locators(self):
"""
Expand Down
29 changes: 24 additions & 5 deletions mets_mods2tei/api/tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import logging
import copy
import re

from contextlib import closing
from urllib.request import urlopen
Expand Down Expand Up @@ -99,8 +100,26 @@ def fill_from_mets(self, mets, ocr=True):
self.set_encoding_description(mets.get_encoding_description())

# repository
if mets.get_owner_manuscript():
self.add_repository(mets.get_owner_manuscript())
if mets.get_location_phys():
# hard to distinguish between settlement, institution and repository at this point
self.add_repository(mets.get_location_phys())
if mets.get_location_urls():
for url in mets.get_location_urls():
# hard to determine type of URL at this point – could be (some form of) presentation,
# URN, PPN, EPN, DOI, URLWeb, URLCatalogue, URLImages, URLText, URLHTML, URLXML, URLTCF, URLIIIF
if url.startswith("urn:"):
typ = "URN"
elif re.fullmatch("10[.][0-9]*/.*", url):
typ = "DOI"
elif re.fullmatch("[0-9]{8}[0-9X]{1,2}", url):
typ = "PPN"
elif re.fullmatch("([0-9]+-)+[0-9]+", url):
typ = "ISBN"
elif re.fullmatch("[0-9]{4}-[0-9]{3}[0-9xX]", url):
typ = "ISSN"
else:
typ = "URL"
self.add_identifier(typ, url)

# shelf locator
for shelf_locator in mets.get_shelf_locators():
Expand Down Expand Up @@ -534,13 +553,13 @@ def set_encoding_description(self, creator):
encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI)
encoding_desc_details.text = "Encoded with the help of %s." % creator

def add_repository(self, repository):
def add_repository(self, name):
"""
Add the repository of the (original) manuscript
"""
ms_ident = self.tree.xpath('//tei:msDesc/tei:msIdentifier', namespaces=ns)[0]
repository_node = etree.SubElement(ms_ident, "%srepository" % TEI)
repository_node.text = repository
repository = etree.SubElement(ms_ident, "%srepository" % TEI)
repository.text = name

def add_identifier(self, type_, value):
"""
Expand Down