From 61c35bd545b70fb8102b82465577a3efd7888bbf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 13:17:12 +0100 Subject: [PATCH 01/35] fix MODS name without roles, ht@kba #51 --- mets_mods2tei/api/mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 56359ba..b7a0eab 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -145,7 +145,7 @@ def __spur(self): person[name_part.get_type()] = name_part.get_valueOf_() # either author or editor - roles = name.get_role()[0].get_roleTerm() + roles = name.get_role()[0].get_roleTerm() if name.get_role() else [] # TODO: handle the complete set of allowed roles for role in roles: if role.get_valueOf_() == "edt": From 499c3ccc064c3e5b1c7c4f70dade68bed100f11a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 15:27:59 +0100 Subject: [PATCH 02/35] fallback to empty publicationStmt/date and encodingDesc if metsHdr is missing --- mets_mods2tei/api/mets.py | 26 ++++++++++++++++++++------ mets_mods2tei/api/tei.py | 8 +++++--- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index b7a0eab..351f93f 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -237,12 +237,26 @@ def __spur(self): # # metsHdr header = self.mets.get_metsHdr() - - # encoding date - self.encoding_date = header.get_CREATEDATE().isoformat() - - # encoding description - self.encoding_desc = list(filter(lambda x: x.get_OTHERTYPE() == "SOFTWARE", header.get_agent()))[0].get_name() + if header: + # encoding date + self.encoding_date = header.get_CREATEDATE() + # encoding description + self.encoding_desc = [agent.get_name() + for agent in header.get_agent() + if agent.get_TYPE() == "OTHER" and agent.get_OTHERTYPE() == "SOFTWARE"] + else: + self.encoding_date = None + self.encoding_desc = None + + if self.encoding_date: + self.encoding_date = 
self.encoding_date.isoformat() + else: + self.logger.error("Found no @CREATEDATE for publicationStmt/date") + if self.encoding_desc: + self.encoding_desc = self.encoding_desc[0] # or -1? + # what about agent.get_OTHERROLE() and agent.get_note()? + else: + self.logger.error("Found no mets:agent for encodingDesc") # # location of manuscript diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 741e72b..6a4d260 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -462,15 +462,17 @@ def add_encoding_date(self, date): publication_stmt = self.tree.xpath('//tei:publicationStmt', namespaces=ns)[0] encoding_date = etree.SubElement(publication_stmt, "%sdate" % TEI) encoding_date.set("type", "publication") - encoding_date.text = date + if date: + encoding_date.text = date def set_encoding_description(self, creator): """ Set some details on the encoding of the digital edition """ encoding_desc = self.tree.xpath('//tei:encodingDesc', namespaces=ns)[0] - encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI) - encoding_desc_details.text = "Encoded with the help of %s." % creator + if creator: + encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI) + encoding_desc_details.text = "Encoded with the help of %s." % creator def add_repository(self, repository): """ From 8984b1b0c52c2981a55cf53f375c1db877e5409d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 17:10:26 +0100 Subject: [PATCH 03/35] get_text_in_line: append HYP content if available --- mets_mods2tei/api/alto.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mets_mods2tei/api/alto.py b/mets_mods2tei/api/alto.py index e1a2cec..4da9af9 100644 --- a/mets_mods2tei/api/alto.py +++ b/mets_mods2tei/api/alto.py @@ -92,7 +92,11 @@ def get_text_in_line(self, line): Returns the ALTO-encoded text . :param Element line: The line to extract the text from. 
""" - return " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) + text = " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) + hyp = line.find("alto:HYP", namespaces=ns) + if hyp is not None: + text += hyp.get("CONTENT") + return text def __compute_fuzzy_distance(self, text1, text2): """ From 7b136c8603587c5bd3c323f467e645aebc57637c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 17:11:06 +0100 Subject: [PATCH 04/35] log to stderr instead of stdout (to prevent mixing with TEI) --- README.md | 2 +- mets_mods2tei/scripts/mets_mods2tei.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 63df7e0..2df14b2 100644 --- a/README.md +++ b/README.md @@ -118,5 +118,5 @@ including the extracted information from the MODS part of the METS. Example: - mm2tei "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" + mm2tei "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" > tei.xml diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index 28a3bdc..35e4b1e 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import +import sys import os import logging import click @@ -19,7 +20,7 @@ def cli(mets, ocr, text_group, log_level): # # logging level - logging.basicConfig(level=logging.getLevelName(log_level)) + logging.basicConfig(level=logging.getLevelName(log_level), stream=sys.stderr) # # interpret mets argument From 6545b162aacbcc173fe03c1e093fc914b9f87f68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 18:20:27 +0100 Subject: [PATCH 05/35] improve makefile --- Makefile | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 
deletions(-) diff --git a/Makefile b/Makefile index fdc422a..3750276 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # Python interpreter. Default: '$(PYTHON)' -PYTHON = python +PYTHON ?= python +PIP ?= pip # BEGIN-EVAL makefile-parser --make-help Makefile @@ -7,12 +8,16 @@ help: @echo "" @echo " Targets" @echo "" + @echo " install Install this package" + @echo " deps Install dependencies only" + @echo " deps-test Install dependencies for testing only" @echo " test Run all unit tests" @echo " coverage Run coverage tests" @echo "" @echo " Variables" @echo "" @echo " PYTHON Python interpreter. Default: '$(PYTHON)'" + @echo " PIP Python packager. Default: '$(PIP)'" # END-EVAL @@ -20,7 +25,16 @@ help: # Tests # -.PHONY: test coverage +.PHONY: install test coverage deps deps-test + +install: + $(PIP) install . + +deps: + $(PIP) install -r requirements.txt + +deps-test: + $(PIP) install -r requirements-test.txt # Run all unit tests test: From 711025a833d46f4e9c0ab8b065dc25e9c72ab803 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Dec 2021 18:47:57 +0100 Subject: [PATCH 06/35] improve CI --- .circleci/config.yml | 45 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 30a331e..668e893 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,19 +1,54 @@ # Python CircleCI 2.1 configuration file # for mets-mods2tei # -# Check https://circleci.com/docs/2.1/language-python/ for more details +# Check https://circleci.com/docs/2.0/language-python/ for more details # version: 2.1 orbs: codecov: codecov/codecov@1.0.5 jobs: - build: + test: + parameters: + version: + type: string docker: - - image: python:3.6 + - image: circleci/python:<< parameters.version >> working_directory: ~/repo steps: - checkout - - run: pip install -r requirements-test.txt - - run: pip install . 
+ - run: make deps deps-test + - run: make install + - run: make test - run: make coverage - codecov/upload + pypi: + docker: + - image: circleci/python:3.6 + working_directory: ~/repo + steps: + - checkout + - setup_remote_docker + - run: make install + - run: python setup.py sdist + - run: | + pip install cibuildwheel + cibuildwheel --output-dir dist + - store_artifacts: + path: dist/ + destination: artifacts + # later: upload to PyPI... + +workflows: + version: 2 + test-all: + jobs: + - test: + matrix: + parameters: + version: [3.5.10, 3.6.15, 3.7.12, 3.8.12, 3.9.9] + deploy: + jobs: + - pypi: + filters: + branches: + only: master From 605dd898b597ae242858edf34f2da720a489a916 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:04:46 +0100 Subject: [PATCH 07/35] mets.fromfile: allow missing logical structmap --- mets_mods2tei/api/mets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 351f93f..a503d2d 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -123,10 +123,9 @@ def __spur(self): # # main title and manuscript type - struct_map_logical = list(filter(lambda x: x.get_TYPE() == "LOGICAL", self.mets.get_structMap()))[0] - title = struct_map_logical.get_div() - self.title = title.get_LABEL() - self.type = title.get_TYPE() + div = self.get_div_structure() + self.title = div.get_LABEL() if div else "" + self.type = div.get_TYPE() if div else "" # # sub titles From 3bfa7c265488c5043c213e2adadb9ec0813a1802 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:05:46 +0100 Subject: [PATCH 08/35] mets.fromfile: allow missing mods originInfo --- mets_mods2tei/api/mets.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index a503d2d..1fc8a38 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -154,29 
+154,34 @@ def __spur(self): # # orgin info - origin_info = self.mods.get_originInfo()[0] + origin_info = self.mods.get_originInfo() # publication place self.places = [] - for place in origin_info.get_place(): - place_ext = {} - for place_term in place.get_placeTerm(): - place_ext[place_term.get_type()] = place_term.get_valueOf_() - self.places.append(place_ext) + if origin_info: + for place in origin_info[0].get_place(): + place_ext = {} + for place_term in place.get_placeTerm(): + place_ext[place_term.get_type()] = place_term.get_valueOf_() + self.places.append(place_ext) # publication dates self.dates = {} - for date_issued in origin_info.get_dateIssued(): - date_type = date_issued.get_point() if date_issued.get_point() != None else "unspecified" - self.dates[date_type] = date_issued.get_valueOf_() + if origin_info: + for date_issued in origin_info[0].get_dateIssued(): + date_type = date_issued.get_point() if date_issued.get_point() != None else "unspecified" + self.dates[date_type] = date_issued.get_valueOf_() # publishers self.publishers = [] - for publisher in origin_info.get_publisher(): - self.publishers.append(publisher.get_valueOf_()) + if origin_info: + for publisher in origin_info[0].get_publisher(): + self.publishers.append(publisher.get_valueOf_()) # edition of the manuscript - self.edition = origin_info.get_edition()[0].get_valueOf_() if origin_info.get_edition() else "" + self.edition = "" + if origin_info and origin_info[0].get_edition(): + self.edition = origin_info[0].get_edition()[0].get_valueOf_() # # languages and scripts From 559e4c1f02916a2b6a2a514c0c9d5741beef61ff Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:07:42 +0100 Subject: [PATCH 09/35] mets.fromfile: allow missing mods physicalDescription --- mets_mods2tei/api/mets.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 1fc8a38..88d004e 100644 --- 
a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -205,15 +205,18 @@ def __spur(self): # # physical description - physical_description = self.mods.get_physicalDescription()[0] + physical_description = self.mods.get_physicalDescription() # digital origin - self.digital_origin = physical_description.get_digitalOrigin()[0] if physical_description.get_digitalOrigin() else "" + self.digital_origin = "" + if physical_description and physical_description[0].get_digitalOrigin(): + self.digital_origin = physical_description[0].get_digitalOrigin()[0] # extent self.extents = [] - for extent in physical_description.get_extent(): - self.extents.append(extent.get_valueOf_()) + if physical_description: + for extent in physical_description[0].get_extent(): + self.extents.append(extent.get_valueOf_()) # # dv FIXME: replace with generated code as soon as schema is available From 1a7fe59038bed7228e381de244cb5593e77d48a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:08:20 +0100 Subject: [PATCH 10/35] mets.fromfile: allow missing mets amdSec provenance dv --- mets_mods2tei/api/mets.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 88d004e..9da0496 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -220,16 +220,20 @@ def __spur(self): # # dv FIXME: replace with generated code as soon as schema is available - dv = etree.fromstring(self.mets.get_amdSec()[0].get_rightsMD()[0].get_mdWrap().get_xmlData().get_anytypeobjs_()[0]) + amdsec = self.mets.get_amdSec() + if amdsec and amdsec[0].get_rightsMD(): + dv = etree.fromstring(amdsec[0].get_rightsMD()[0].get_mdWrap().get_xmlData().get_anytypeobjs_()[0]) + else: + dv = [] # owner of the digital edition - self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text + self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text if dv else "" # availability/license # common case 
self.license = "" self.license_url = "" - license_nodes = dv.xpath("//dv:license", namespaces=ns) + license_nodes = dv.xpath("//dv:license", namespaces=ns) if dv else [] if license_nodes != []: self.license = license_nodes[0].text self.license_url = "" From af1740e07afddf9bf0533f517777fa324aca5060 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:09:40 +0100 Subject: [PATCH 11/35] mets.fromfile: simplify physical struct map, allow missing @ORDER --- mets_mods2tei/api/mets.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 9da0496..409cea6 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -52,6 +52,7 @@ def __init__(self): self.tree = None self.mets = None self.mods = None + self.page_map = {} self.order_map = {} self.img_map = {} self.alto_map = {} @@ -319,8 +320,10 @@ def __spur(self): default_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) # struct map physical - for div in list(filter(lambda x: x.get_TYPE() == 'PHYSICAL', self.mets.get_structMap()))[0].get_div().get_div(): - self.order_map[div.get_ID()] = div.get_ORDER() + for div in self.get_page_structure().get_div(): + self.page_map[div.get_ID()] = div + if div.get_ORDER(): + self.order_map[div.get_ID()] = div.get_ORDER() for fptr in div.get_fptr(): if fptr.get_FILEID() in fulltext_map: self.alto_map[div.get_ID()] = fulltext_map[fptr.get_FILEID()] @@ -472,6 +475,15 @@ def get_languages(self): """ return self.languages + def get_page_structure(self): + """ + Return the div structure from the physical struct map + """ + for struct_map in self.mets.get_structMap(): + if struct_map.get_TYPE() == "PHYSICAL": + return struct_map.get_div() + return None + def get_div_structure(self): """ Return the div structure from the logical struct map From 18a2ddeafe9df312c7dc5ff51f05114fcd699c21 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 
Dec 2021 01:10:34 +0100 Subject: [PATCH 12/35] mets.fromfile: allow missing struct link --- mets_mods2tei/api/mets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 409cea6..d8f107b 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -331,7 +331,8 @@ def __spur(self): self.img_map[div.get_ID()] = default_map[fptr.get_FILEID()] # struct links - for sm_link in self.tree.xpath("//mets:structLink", namespaces=ns)[0].iterchildren(): + structlinks = self.tree.xpath("//mets:structLink/*", namespaces=ns) + for sm_link in structlinks: if sm_link.get("%sto" % XLINK) in self.alto_map: if sm_link.get("%sfrom" % XLINK) not in self.struct_links: self.struct_links[sm_link.get("%sfrom" % XLINK)] = [] From dbcc1feb9fc0d2310228423981c6402373e1f23f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:11:44 +0100 Subject: [PATCH 13/35] teil.fill_from_mets: allow empty logical struct map and struct link (fall back to physical pages, sorted if possible, or error with empty text) --- mets_mods2tei/api/mets.py | 2 +- mets_mods2tei/api/tei.py | 34 ++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index d8f107b..18fa66c 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -492,7 +492,7 @@ def get_div_structure(self): for struct_map in self.mets.get_structMap(): if struct_map.get_TYPE() == "LOGICAL": return struct_map.get_div() - return [] + return None def get_struct_links(self, log_id): """ diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 6a4d260..f9ce804 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -132,7 +132,18 @@ def fill_from_mets(self, mets, ocr=True): # text part # div structure - self.add_div_structure(mets.get_div_structure()) + div = mets.get_div_structure() + if div is not None: + 
self.logger.info("Found logical structMap for %s", div.get_TYPE()) + self.add_div_structure(div) + elif any(mets.alto_map): + self.logger.warning("Found no logical structMap div, falling back to physical") + pages = mets.alto_map.keys() + if any(mets.order_map.values()): + pages = sorted(pages, key=mets.get_order) + self.add_div_structure(None, map(mets.page_map.get, pages)) + else: + self.logger.error("Found no logical or physical structMap div") # OCR if ocr: @@ -597,6 +608,9 @@ def __add_ocr_to_node(self, node, mets): for childnode in node.iterchildren(): self.__add_ocr_to_node(childnode, mets) struct_links = mets.get_struct_links(node.get("id")) + if not struct_links and node.get("id") in mets.page_map: + # already physical + struct_links = [node.get("id")] # a header will always be on the first page of a div first = True @@ -678,15 +692,23 @@ def __add_ocr_to_node(self, node, mets): node.insert(0, par) first = False - def add_div_structure(self, div): + def add_div_structure(self, div, pages=None): """ - Add div elements to the text body according to the given list of divs + Add logical div elements to the text font/body/back according to the given div hierarchy """ # div structure has to be added to text text = self.tree.xpath('//tei:text', namespaces=ns)[0] + front = etree.SubElement(text, "%sfront" % TEI) + body = etree.SubElement(text, "%sbody" % TEI) + back = etree.SubElement(text, "%sback" % TEI) + + if pages: + for page in pages: + self.__add_div(body, page, 1) + return - # decent to the deepest AMD + # descend to the deepest AMD while div.get_ADMID() is None: div = div.get_div()[0] start_div = div.get_div()[0] @@ -694,10 +716,6 @@ def add_div_structure(self, div): div = start_div start_div = start_div.get_div()[0] - front = etree.SubElement(text, "%sfront" % TEI) - body = etree.SubElement(text, "%sbody" % TEI) - back = etree.SubElement(text, "%sback" % TEI) - entry_point = front for sub_div in div.get_div(): From 
61c46249b4f08f5c17a65a510f9fab425b8cfd18 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Dec 2021 01:13:48 +0100 Subject: [PATCH 14/35] METS to TEI structure: comment urging for more+better mappings --- mets_mods2tei/api/tei.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index f9ce804..a46a7b1 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -724,8 +724,48 @@ def add_div_structure(self, div, pages=None): elif sub_div.get_TYPE() == "title_page": self.__add_div(entry_point, sub_div, 1, "titlePage") else: + # FIXME: if title_page gets preceded by figure/preface/contents/..., they *all* will end up in body entry_point = body self.__add_div(entry_point, sub_div, 1) + # FIXME: add more structural mappings from METS-Anwendungsprofil (DFG Strukturdatenset) to TEI-P5 tagset (DTAbf) + # ...for example: + # contents → contents + # corrigenda → corrigenda + # dedication → dedication + # index → index + # imprint → imprint + # ? → imprimatur + # priviledges? → copyright + # provenance → ? + # ? → appendix + # ? → advertisement + # preface → preface + # ? → postface + # chapter → chapter + # letter → letter + # verse → poem + # ? → diaryEntry + # ? → recipe + # ? → scene + # ? → act + # ? → frontispiece + # ? → bibliography + # list_illustrations? → figures + # ? → abbreviations + # ? → edition + # cover → ? + # cover_front → ? + # cover_back → ? + # table → ? + # manuscript → ? + # illustration → ? + # section → ? + # article → ? + # issue → ? + # day → ? + # month → ? + # volume → ? + # year → ? 
def __add_div(self, insert_node, div, n, tag="div"): """ From 15022f59c71c4f42bbca822fe55501ed97eb6768 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 09:02:35 +0100 Subject: [PATCH 15/35] rename changelog --- Changelog => CHANGELOG.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Changelog => CHANGELOG.md (100%) diff --git a/Changelog b/CHANGELOG.md similarity index 100% rename from Changelog rename to CHANGELOG.md From 553e0fd71ffc4a1d57083496aaa97d929dc4f5c3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 10:43:27 +0100 Subject: [PATCH 16/35] improve+update changelog --- CHANGELOG.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82258e3..54f686d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,31 +8,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Added tests for TEI API - Added tests for insertion index identification -- Evaluate texts from all struct types but `binding` and `colour_checker` - Add `front`, `body` and `back` per default +- Log to stderr instead of stdout ### Fixed -- https://github.com/slub/mets-mods2tei/issues/43 -- https://github.com/slub/mets-mods2tei/issues/47 +- Evaluate texts from all struct types but `binding` and `colour_checker`, #43 +- Handle errors during language code expansion, and fallback to `Unbekannt`, #47 +- Add ALTO `HYP` text content if available, #52 +- Allow empty logical structMap and structLink, fallback to physical, or empty, #57 +- Allow partial dmdSec (MODS) or amdSec, fallback to empty, #46, #51 ## [0.1.1] - 2020-05-11 ### Added -- Treat nested AMD-type (non-logical) divs in logical struct map (i.e. 
-newspaper case) - Make full text file group selectable by user -- Allow for file entries (in addition to URLs) in METS -- Add special treatment for URNs and VD IDs - Add poor man's namespace versioning handling ### Changed - Make extraction of subtitles conditional on their presence -- Use "licence" for all types of licences (even unknown ones) +- Use "licence" for all types of licences (even unknown ones), #39 ### Fixed -- https://github.com/slub/mets-mods2tei/issues/28 -- https://github.com/slub/mets-mods2tei/issues/37 -- https://github.com/slub/mets-mods2tei/issues/39 -- https://github.com/slub/mets-mods2tei/issues/41 +- Handle nested `@ADMID="AMD"` divs in logical `structMap` (i.e. newspaper case), #43 +- Allow for local path entries (in addition to URLs) in METS, #41 +- Add special treatment for URNs and VD IDs, #37 ## [0.1.0] - 2019-12-04 ### Added @@ -40,13 +38,16 @@ newspaper case) - Set `corresp` and `facs` attributes of `pb` elements - Store links to `DEFAULT` images in METS - Tests for new functionality +- Add Changelog file, #28 ### Changed -- Retrieve ALTO files via a dedicated struct link member of the class Mets -- Move text retrieval to Alto class +- Retrieve ALTO files via a dedicated struct link member of the class `Mets` +- Move text retrieval to `Alto` class ### Removed - Get rid of code artifacts carried over from `tocrify` -### Fixed -- https://github.com/slub/mets-mods2tei/issues/28 + +[unreleased]: ../../compare/v0.1.1...master +[0.1.1]: ../../compare/v0.1.0...v0.1.1 +[0.1.0]: ../../compare/v1.0...v0.1.0 From 27dffe80430b932208d02933f3548aae47479d43 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 14:08:09 +0100 Subject: [PATCH 17/35] differentiate image number and page number --- mets_mods2tei/api/mets.py | 13 +++++++++++-- mets_mods2tei/api/tei.py | 9 ++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 18fa66c..2ec1dac 100644 --- 
a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -54,6 +54,7 @@ def __init__(self): self.mods = None self.page_map = {} self.order_map = {} + self.orderlabel_map = {} self.img_map = {} self.alto_map = {} self.struct_links = {} @@ -324,6 +325,8 @@ def __spur(self): self.page_map[div.get_ID()] = div if div.get_ORDER(): self.order_map[div.get_ID()] = div.get_ORDER() + if div.get_ORDERLABEL(): + self.orderlabel_map[div.get_ID()] = div.get_ORDERLABEL() for fptr in div.get_fptr(): if fptr.get_FILEID() in fulltext_map: self.alto_map[div.get_ID()] = fulltext_map[fptr.get_FILEID()] @@ -514,6 +517,12 @@ def get_alto(self, phys_id): def get_order(self, phys_id): """ - Return the manually set order for a given physical ID + Return the logical (manually set) page number for a given physical ID """ - return self.order_map.get(phys_id, "-1") + return self.order_map.get(phys_id, "0") + + def get_orderlabel(self, phys_id): + """ + Return the logical (manually set) page label for a given physical ID + """ + return self.orderlabel_map.get(phys_id, "") diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index a46a7b1..0477aa6 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -639,7 +639,14 @@ def __add_ocr_to_node(self, node, mets): self.alto_map[alto_link] = alto pb = etree.SubElement(node, "%spb" % TEI) - pb.set("facs", "#f{:04d}".format(int(mets.get_order(struct_link)))) + try: + pagenum = list(mets.page_map.keys()).index(struct_link) + pb.set("facs", "#f{:04d}".format(pagenum + 1)) + except ValueError: + self.logger.warning("cannot determine image number for '%s'", struct_link) + pagenum = mets.get_orderlabel(struct_link) or mets.get_order(struct_link) + if pagenum: + pb.set("n", str(pagenum)) pb.set("corresp", mets.get_img(struct_link)) for text_block in alto.get_text_blocks(): From c39b6c707171979da2e80603803bb0d01a64f8b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 14:15:10 +0100 Subject: [PATCH 18/35] 
allow passing image fileGrp other than DEFAULT --- mets_mods2tei/api/mets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 2ec1dac..77b6dc0 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -59,6 +59,7 @@ def __init__(self): self.alto_map = {} self.struct_links = {} self.fulltext_group_name = 'FULLTEXT' + self.image_group_name = 'DEFAULT' self.title = None self.sub_titles = None @@ -315,7 +316,7 @@ def __spur(self): # default default_map = {} - default_group = self.tree.xpath("//mets:fileGrp[@USE='DEFAULT']", namespaces=ns) + default_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.image_group_name, namespaces=ns) if default_group: for entry in default_group[0].xpath("./mets:file", namespaces=ns): default_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) From 71fd269e2c66dca1504e85e2755b58e97b1aa690 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 23:14:10 +0100 Subject: [PATCH 19/35] add params for image fileGrp and output file, more logging --- README.md | 18 +++++++--- mets_mods2tei/api/mets.py | 50 +++++++++++++++----------- mets_mods2tei/api/tei.py | 19 +++++++--- mets_mods2tei/scripts/mets_mods2tei.py | 22 +++++++++--- 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 2df14b2..4ac616d 100644 --- a/README.md +++ b/README.md @@ -106,11 +106,22 @@ Usage: mm2tei [OPTIONS] METS METS: File containing or URL pointing to the METS/MODS XML to be converted + Parse given METS and its meta-data, and convert it to TEI. + + If `--ocr` is given, then also read the ALTO full-text files from the + fileGrp in `--text-group`, and convert page contents accordingly (in + physical order). Decorate page boundaries with image and page numbers, and + reference the corresponding base image files from `--img-group`. 
+ + Output XML to `--output (use '-' for stdout), log to stderr.` + Options: + -O, --output FILENAME File path to write TEI output to -o, --ocr Serialize OCR into resulting TEI - -T, --text-group TEXT File group which contains the full text + -T, --text-group TEXT File group which contains the full-text + -I, --img-group TEXT File group which contains the images -l, --log-level [DEBUG|INFO|WARN|ERROR|OFF] - --help Show this message and exit. + -h, --help Show this message and exit. ``` It reads METS XML via URL or file argument and prints the resulting TEI, @@ -118,5 +129,4 @@ including the extracted information from the MODS part of the METS. Example: - mm2tei "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" > tei.xml - + mm2tei -O tei.xml "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 77b6dc0..955bf5f 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -227,16 +227,16 @@ def __spur(self): if amdsec and amdsec[0].get_rightsMD(): dv = etree.fromstring(amdsec[0].get_rightsMD()[0].get_mdWrap().get_xmlData().get_anytypeobjs_()[0]) else: - dv = [] + dv = None # owner of the digital edition - self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text if dv else "" + self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text if dv is not None else "" # availability/license # common case self.license = "" self.license_url = "" - license_nodes = dv.xpath("//dv:license", namespaces=ns) if dv else [] + license_nodes = dv.xpath("//dv:license", namespaces=ns) if dv is not None else [] if license_nodes != []: self.license = license_nodes[0].text self.license_url = "" @@ -312,35 +312,43 @@ def __spur(self): if fulltext_group: fulltext_map = {} for entry in fulltext_group[0].xpath("./mets:file", namespaces=ns): - 
fulltext_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) - - # default - default_map = {} - default_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.image_group_name, namespaces=ns) - if default_group: - for entry in default_group[0].xpath("./mets:file", namespaces=ns): - default_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + url = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + self.logger.debug("Found full-text file: %s", url) + fulltext_map[entry.get("ID")] = url + + # image + image_map = {} + image_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.image_group_name, namespaces=ns) + if image_group: + for entry in image_group[0].xpath("./mets:file", namespaces=ns): + url = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + self.logger.debug("Found image file: %s", url) + image_map[entry.get("ID")] = url # struct map physical for div in self.get_page_structure().get_div(): - self.page_map[div.get_ID()] = div + page = div.get_ID() + self.logger.debug("Found physical page: %s", page) + self.page_map[page] = div if div.get_ORDER(): - self.order_map[div.get_ID()] = div.get_ORDER() + self.order_map[page] = div.get_ORDER() if div.get_ORDERLABEL(): - self.orderlabel_map[div.get_ID()] = div.get_ORDERLABEL() + self.orderlabel_map[page] = div.get_ORDERLABEL() for fptr in div.get_fptr(): if fptr.get_FILEID() in fulltext_map: - self.alto_map[div.get_ID()] = fulltext_map[fptr.get_FILEID()] - elif fptr.get_FILEID() in default_map: - self.img_map[div.get_ID()] = default_map[fptr.get_FILEID()] + self.alto_map[page] = fulltext_map[fptr.get_FILEID()] + elif fptr.get_FILEID() in image_map: + self.img_map[page] = image_map[fptr.get_FILEID()] # struct links structlinks = self.tree.xpath("//mets:structLink/*", namespaces=ns) for sm_link in structlinks: - if sm_link.get("%sto" % XLINK) in self.alto_map: - if sm_link.get("%sfrom" % XLINK) not in self.struct_links: - 
self.struct_links[sm_link.get("%sfrom" % XLINK)] = [] - self.struct_links[sm_link.get("%sfrom" % XLINK)].append(sm_link.get("%sto" % XLINK)) + logical = sm_link.get("%sfrom" % XLINK) + physical = sm_link.get("%sto" % XLINK) + if physical in self.alto_map: + self.logger.debug("Found structLink from %s to physical page: %s", logical, physical) + pages = self.struct_links.setdefault(logical, list()) + pages.append(physical) @property def fulltext_group_name(self): diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 0477aa6..cf1193a 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -134,7 +134,7 @@ def fill_from_mets(self, mets, ocr=True): # div structure div = mets.get_div_structure() if div is not None: - self.logger.info("Found logical structMap for %s", div.get_TYPE()) + self.logger.debug("Found logical structMap for %s", div.get_TYPE()) self.add_div_structure(div) elif any(mets.alto_map): self.logger.warning("Found no logical structMap div, falling back to physical") @@ -604,13 +604,15 @@ def __add_ocr_to_node(self, node, mets): """ Add text to a given node and recursively add text to children too (post order!) 
""" - + + node_id = node.get("id") + self.logger.debug("Adding text for %s", node_id) for childnode in node.iterchildren(): self.__add_ocr_to_node(childnode, mets) - struct_links = mets.get_struct_links(node.get("id")) - if not struct_links and node.get("id") in mets.page_map: + struct_links = mets.get_struct_links(node_id) + if not struct_links and node_id in mets.page_map: # already physical - struct_links = [node.get("id")] + struct_links = [node_id] # a header will always be on the first page of a div first = True @@ -712,14 +714,18 @@ def add_div_structure(self, div, pages=None): if pages: for page in pages: + self.logger.debug("Found physical page %s", page.get_ID()) self.__add_div(body, page, 1) return # descend to the deepest AMD while div.get_ADMID() is None: + self.logger.debug("Found logical outer div type %s: %s", div.get_TYPE(), div.get_ID()) div = div.get_div()[0] start_div = div.get_div()[0] + self.logger.debug("Found logical inner div type %s: %s", start_div.get_TYPE(), start_div.get_ID()) while start_div.get_div() and start_div.get_div()[0].get_ADMID() is not None: + self.logger.debug("Found logical inner div type %s: %s", start_div.get_TYPE(), start_div.get_ID()) div = start_div start_div = start_div.get_div()[0] @@ -785,6 +791,9 @@ def __add_div(self, insert_node, div, n, tag="div"): #head = etree.SubElement(new_div, "%s%s" % (TEI, "head")) #head.text = div.get_LABEL() new_div.set("rend", div.get_LABEL()) + self.logger.debug("Adding %s[@id=%s,@n=%d,@rend=%s] for %s", + tag, div.get_ID(), n, div.get_LABEL() or "", + insert_node.tag.split('}')[-1]) for sub_div in div.get_div(): self.__add_div(new_div, sub_div, n+1) diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index 35e4b1e..cb4b059 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -10,13 +10,24 @@ from mets_mods2tei import Mets from mets_mods2tei import Tei -@click.command() 
+@click.command(context_settings={'help_option_names': ['-h', '--help']}) @click.argument('mets', required=True) +@click.option('-O', '--output', default="-", type=click.File("wb"), help="File path to write TEI output to") @click.option('-o', '--ocr', is_flag=True, default=False, help="Serialize OCR into resulting TEI") -@click.option('-T', '--text-group', default="FULLTEXT", help="File group which contains the full text") +@click.option('-T', '--text-group', default="FULLTEXT", help="File group which contains the full-text") +@click.option('-I', '--img-group', default="DEFAULT", help="File group which contains the images") @click.option('-l', '--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'OFF']), default='WARN') -def cli(mets, ocr, text_group, log_level): - """ METS: File containing or URL pointing to the METS/MODS XML to be converted """ +def cli(mets, output, ocr, text_group, img_group, log_level): + """METS: File containing or URL pointing to the METS/MODS XML to be converted + + Parse given METS and its meta-data, and convert it to TEI. + + If `--ocr` is given, then also read the ALTO full-text files from the fileGrp in `--text-group`, + and convert page contents accordingly (in physical order). Decorate page boundaries with image + and page numbers, and reference the corresponding base image files from `--img-group`. 
+ + Output XML to `--output` (use '-' for stdout), log to stderr. + """ # # logging level @@ -33,6 +44,7 @@ def cli(mets, ocr, text_group, log_level): # read in METS mets = Mets() mets.fulltext_group_name = text_group + mets.image_group_name = img_group mets.fromfile(f) # @@ -41,7 +53,7 @@ def cli(mets, ocr, text_group, log_level): tei.fill_from_mets(mets, ocr) - click.echo(tei.tostring()) + output.write(tei.tostring()) if __name__ == '__main__': From 5c20f9051ae439a57a20ad6bb55f27cafe15308f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Dec 2021 23:28:06 +0100 Subject: [PATCH 20/35] update changelog --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54f686d..7372172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- tests for TEI API +- tests for insertion index identification +- more logging +- CLI param for output file +- CLI param for image fileGrp + ### Changed -- Added tests for TEI API -- Added tests for insertion index identification - Add `front`, `body` and `back` per default - Log to stderr instead of stdout +- Differentiate between (physical) image nr and (logical) page nr ### Fixed - Evaluate texts from all struct types but `binding` and `colour_checker`, #43 From ad261ff1580e7170d6ab3bd73879fb404dfd136c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 12 Dec 2021 23:31:17 +0100 Subject: [PATCH 21/35] generalize passing URN and VD ID to all identifiers --- mets_mods2tei/api/mets.py | 26 ++++++++------------------ mets_mods2tei/api/tei.py | 36 +++++++++--------------------------- 2 files changed, 17 insertions(+), 45 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 955bf5f..85832e9 100644 ---
a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -77,8 +77,7 @@ def __init__(self): self.encoding_desc = None self.owner_manuscript = None self.shelf_locators = None - self.urn = None - self.vd_id = None + self.identifiers = None self.scripts = None self.collections = None self.languages = None @@ -285,14 +284,11 @@ def __spur(self): # # URN and VD ID - self.urn = "" - self.vd_id = "" + self.identifiers = dict() identifiers = self.mods.get_identifier() - for identifier in identifiers: - if identifier.get_type().lower() == "urn": - self.urn = identifier.get_valueOf_() - elif identifier.get_type().lower().startswith("vd"): - self.vd_id = identifier.get_valueOf_() + if len(identifiers): + for identifier in identifiers: + self.identifiers[identifier.get_type()] = identifier.get_valueOf_() # # collections (from relatedItem) @@ -458,17 +454,11 @@ def get_shelf_locators(self): """ return self.shelf_locators - def get_urn(self): + def get_identifiers(self): """ - Return the URN of the digital representation + Return the (dict of) identifiers of the digital representation """ - return self.urn - - def get_vd_id(self): - """ - Return the VD ID of the digital representation - """ - return self.vd_id + return self.identifiers def get_scripts(self): """ diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index cf1193a..f8c4de4 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -100,13 +100,12 @@ def fill_from_mets(self, mets, ocr=True): # shelf locator for shelf_locator in mets.get_shelf_locators(): - self.add_shelfmark(shelf_locator) + self.add_ms_identifier("shelfmark", shelf_locator) # identifiers - if mets.get_urn(): - self.add_urn(mets.get_urn()) - if mets.get_vd_id(): - self.add_vd_id(mets.get_vd_id()) + if mets.get_identifiers(): + for type_, value in mets.get_identifiers().items(): + self.add_ms_identifier(type_.upper(), value) # type description if mets.get_scripts(): @@ -493,32 +492,15 @@ def add_repository(self, 
repository): repository_node = etree.SubElement(ms_ident, "%srepository" % TEI) repository_node.text = repository - def add_shelfmark(self, shelfmark): + def add_ms_identifier(self, type_, value): """ - Add the shelf mark of the (original) manuscript + Add the URN, PURL, VD ID, shelfmark etc. of the digital edition """ ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] + # FIXME: URN, DTAID, ... should go to /tei:fileDesc/tei:publicationStmt/tei:idno instead idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "shelfmark") - idno.text = shelfmark - - def add_urn(self, urn): - """ - Add the URN of the digital edition - """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "URN") - idno.text = urn - - def add_vd_id(self, vd_id): - """ - Add the VD ID of the digital edition - """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "VD") - idno.text = vd_id + idno.set("type", type_) + idno.text = value def set_type_desc(self, description): """ From 93fb68469356779ba73d08c05dc6572542e61372 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 13 Dec 2021 13:19:50 +0100 Subject: [PATCH 22/35] =?UTF-8?q?improve=20level,=20title=20and=20idno=20m?= =?UTF-8?q?etadata=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - for `sourceDesc/biblFull/titleStmt/title/@level`, only use allowed values (m/a/j/s/u), and try mapping from top-level logical `div/@TYPE` - for `sourceDesc/bibl/@type`, try mapping from top-level logical `div/@TYPE` - instead of ignoring `titleInfo` main and part/volume titles, - prefer main title from titleInfo over top-level logical `div/@LABEL` - prefer `titleInfo/@type=uniform` or empty over 
abbrev/alternative/translated - also parse and add `partNumber/partName` or `part` - instead of spilling titleInfo between `fileDesc/titleStmt` and `biblFull/titleStmt`, copy the former to the latter when complete, and then add `@level` etc --- CHANGELOG.md | 5 ++ mets_mods2tei/api/mets.py | 85 ++++++++++++++++++-- mets_mods2tei/api/tei.py | 120 ++++++++++++++++++++-------- mets_mods2tei/data/tei_skeleton.xml | 5 +- tests/test_mets.py | 6 +- tests/test_tei.py | 19 ++--- 6 files changed, 184 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7372172..69ae629 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add ALTO `HYP` text content if available, #52 - Allow empty logical structMap and structLink, fallback to physical, or empty, #57 - Allow partial dmdSec (MODS) or amdSec, fallback to empty, #46, #51 +- Pass all `mods:identifier`s to `msIdentifier/idno` (not just VD and URN) +- Parse full `titleInfo` (main/sub/part/volume), and re-use in `biblFull` +- Prefer `titleInfo/title` over `div/@LABEL` if available +- Map top logical `div/@TYPE` into allowed `biblFull/title/@level` only +- Map top logical `div/@TYPE` into appropriate `bibl/@type` if possible ## [0.1.1] - 2020-05-11 ### Added diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 85832e9..7567bc3 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -63,6 +63,8 @@ def __init__(self): self.title = None self.sub_titles = None + self.part_titles = None + self.volume_titles = None self.authors = None self.editors = None self.places = None @@ -124,17 +126,74 @@ def __spur(self): """ # - # main title and manuscript type + # get publication level + # get main and sub title from top-level logical div as a fallback + self.title = "" + self.biblevel = None + self.bibtype = None div = self.get_div_structure() - self.title = div.get_LABEL() if div else "" - 
self.type = div.get_TYPE() if div else "" + if div: + self.title = div.get_LABEL() # overridden by any titleInfo + div_type = div.get_TYPE() + # differentiate between analytic and closed, periodic and singular, dependent and independent types + # (for use in bibl/@type and biblFull//title/@level): + # FIXME: verify this ruleset is correct/standardized (but criteria do not look orthogonal, e.g. "issue" and "proceeding") + if div_type in ["bachelor_thesis", "diploma_thesis", "magister_thesis", "master_thesis", "doctoral_thesis", "habilitation_thesis", "file", "register", "research_paper", "report", "atlas", "album", "letter", "document", "leaflet", "manuscript", "poster", "plan", "study", "judgement", "preprint", "dossier", "paper"]: + self.biblevel = 'u' # unpublished + self.bibtype = 'M' # monograph + elif div_type in ["contained_work", "folder", ]: + self.biblevel = 'a' + self.bibtype = 'DM' # dependent part of monograph + # ? or 'DS' # dependent part of series + elif div_type in ["article"]: + self.biblevel = 'a' # analytic + self.bibtype = 'JA' # journal article + elif div_type in ["periodical", "newspaper"]: + self.biblevel = 'j' # journal + self.bibtype = 'J' # journal + elif div_type in ["lecture"]: + self.biblevel = 's' # series + self.bibtype = '' # ? + elif div_type in ["monograph", ]: + self.biblevel = 'm' # monograph + self.bibtype = 'M' # monograph + elif div_type in ["multivolume_work", "volume"]: + self.biblevel = 'm' # monograph + self.bibtype = 'MM' # monograph within multi-volume monograph + # ? or 'MS' # monograph within series + # ?
or 'MMS' # monograph within multi-volume monograph series # - # sub titles - self.sub_titles = [] - for title_info in self.mods.get_titleInfo(): + # titleInfo (main, sub, part/volume) + self.sub_titles = [] # subtitle (mods:titleInfo[mods:subTitle] + self.part_titles = dict() # part title of multipart subseries (mods:titleInfo[mods:partNumber|mods:partName]) + self.volume_titles = dict() # volume title in multivolume monograph (mods:part[mods:detail]) + title_infos = self.mods.get_titleInfo() + if len(title_infos): + def norm_title_first(titleInfo): + if not titleInfo.get_type() or titleInfo.get_type() == 'simple': + # prefer untyped entry ('simple' most likely is from generateDS) + return -1 + if titleInfo.get_type() == 'uniform': + return 0 + return 1 + title_info = sorted(title_infos, key=norm_title_first)[0] + if title_info.get_title(): + self.title = title_info.get_title()[0].get_valueOf_().strip() for sub_title in title_info.get_subTitle(): self.sub_titles.append(sub_title.get_valueOf_().strip()) + for part_number, part_name in zip(title_info.get_partNumber(), title_info.get_partName()): + self.part_titles[part_number.get_valueOf_().strip()] = part_name.get_valueOf_().strip() + part_infos = self.mods.get_part() + if len(part_infos): + part_info = part_infos[0] + order = str(part_info.get_order() or 0) + for detail in part_info.get_detail(): + typ = detail.get_type() + val = ', '.join([title.get_valueOf_().strip() + for title in detail.get_number() + detail.get_caption() + detail.get_title()]) + self.volume_titles[order, typ] = val + # # authors and editors self.authors = [] @@ -366,10 +425,22 @@ def get_main_title(self): def get_sub_titles(self): """ - Return the main title of the work. + Return the sub-titles of the work. """ return self.sub_titles + def get_part_titles(self): + """ + Return the part titles of the work. + """ + return self.part_titles + + def get_volume_titles(self): + """ + Return the volume titles of the work. 
+ """ + return self.volume_titles + def get_authors(self): """ Return the author of the work. diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index f8c4de4..04bef62 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -38,6 +38,7 @@ def tostring(self): """ Serializes the TEI object as xml string. """ + # needs lxml>=4.5: etree.indent(self.tree, space=" ") return etree.tostring(self.tree, encoding="utf-8") def fill_from_mets(self, mets, ocr=True): @@ -50,13 +51,16 @@ def fill_from_mets(self, mets, ocr=True): # main title self.set_main_title(mets.get_main_title()) + for sub in mets.get_sub_titles(): + self.add_sub_title(sub) + for number, part in mets.get_part_titles().items(): + self.add_part_title(number, part) + for (order, typ), volume in mets.get_volume_titles().items(): + self.add_volume_title(order, typ, volume) + self.init_biblFull() # publication level - self.set_publication_level(mets.type) - - # sub titles - for sub_title in mets.get_sub_titles(): - self.add_sub_title(sub_title) + self.set_publication_level(mets.biblevel) # authors for typ, author in mets.get_authors(): @@ -100,12 +104,14 @@ def fill_from_mets(self, mets, ocr=True): # shelf locator for shelf_locator in mets.get_shelf_locators(): - self.add_ms_identifier("shelfmark", shelf_locator) + self.add_identifier("shelfmark", shelf_locator) # identifiers if mets.get_identifiers(): for type_, value in mets.get_identifiers().items(): - self.add_ms_identifier(type_.upper(), value) + if type_ in ["vd16", "vd17", "vd18"]: + type_ = "VD" + self.add_identifier(type_.upper(), value) # type description if mets.get_scripts(): @@ -125,7 +131,7 @@ def fill_from_mets(self, mets, ocr=True): # # citation - self.compile_bibl() + self.compile_bibl(mets.bibtype) # # text part @@ -156,13 +162,6 @@ def main_title(self): """ return self.tree.xpath('//tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].text - @property - def publication_level(self): - """ - Return the level of 
publication ('monographic' vs. 'analytic') - """ - return self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].get("level") - @property def subtitles(self): """ @@ -182,6 +181,13 @@ def authors(self): authors.append(", ".join(author.xpath('descendant-or-self::*/text()'))) return authors + @property + def publication_level(self): + """ + Return the level of publication ('monographic' vs. 'analytic') + """ + return self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].get("level") + @property def dates(self): """ @@ -330,26 +336,70 @@ def bibl(self): def set_main_title(self, string): """ - Set the main title of the title statements. + Set the main title of the tei:titleStmt. """ - for main_title in self.tree.xpath('//tei:titleStmt/tei:title[@type="main"]', namespaces=ns): - main_title.text = string + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + for node in titleStmt.xpath('tei:title[@type="main"]', namespaces=ns): + node.text = string - def set_publication_level(self, level): + def add_sub_title(self, string): """ - Set the level of publication ('monographic' vs. 'analytic') + Add a sub-title of the tei:titleStmt. """ - self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].set("level", level) + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", "sub") + node.text = string + titleStmt.append(copy.deepcopy(node)) - def add_sub_title(self, string): + def add_part_title(self, number, string): """ - Add a sub title to the title statements. + Add a part title of the tei:titleStmt. 
""" - sub_title = etree.Element("%stitle" % TEI) - sub_title.set("type", "sub") - sub_title.text = string - for title_stmt in self.tree.xpath('//tei:titleStmt', namespaces=ns): - title_stmt.append(copy.deepcopy(sub_title)) + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", "part") + node.set("n", number) + node.text = string + titleStmt.append(copy.deepcopy(node)) + + def add_volume_title(self, number, typ, string): + """ + Add a volume title of the tei:titleStmt. + """ + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", typ) + node.set("n", number) + node.text = string + titleStmt.append(copy.deepcopy(node)) + + def init_biblFull(self): + """ + Set the main, sub, and part/volume titles of the tei:biblFull by copying from tei:titleStmt. + """ + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + bibl = self.tree.xpath('//tei:sourceDesc/tei:biblFull', namespaces=ns)[0] + bibl.append(copy.deepcopy(titleStmt)) + + def set_publication_level(self, level): + """ + Set the level of publication: + - 'm': (monographic) the title applies to a monograph such as a book + or other item considered to be a distinct publication, + including single volumes of multi-volume works + - 'a': (analytic) the title applies to an analytic item, such as an article, + poem, or other work published as part of a larger item. 
+ - 'j': (journal) the title applies to any serial or periodical publication + such as a journal, magazine, or newspaper + - 's': (series) the title applies to a series of otherwise distinct publications + such as a collection + - 'u': (unpublished) the title applies to any unpublished material + (including theses and dissertations unless published by a commercial press) + """ + assert level in ['m', 'a', 'j', 's', 'u'] + for title in self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title', namespaces=ns): + title.set("level", level) def add_author(self, person, typ): """ @@ -492,13 +542,13 @@ def add_repository(self, repository): repository_node = etree.SubElement(ms_ident, "%srepository" % TEI) repository_node.text = repository - def add_ms_identifier(self, type_, value): + def add_identifier(self, type_, value): """ Add the URN, PURL, VD ID, shelfmark etc. of the digital edition """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] + ms_ident = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] # FIXME: URN, DTAID, ... should go to /tei:fileDesc/tei:publicationStmt/tei:idno instead - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) + idno = etree.SubElement(ms_ident, "%sidno" % TEI) idno.set("type", type_) idno.text = value @@ -542,16 +592,16 @@ def add_collection(self, collection): creation = etree.SubElement(profile_desc, "%screation" % TEI) creation.text = collection - def compile_bibl(self): + def compile_bibl(self, type_): """ Compile the content of the short citation element 'bibl' based on the current state """ - if self.publication_level: - self.bibl.set("type", self.publication_level) + if type_: + self.bibl.set("type", type_) bibl_text = "" if self.authors: bibl_text += "; ".join(self.authors) + ": " - elif self.publication_level == "monograph": + elif type_.startswith("M"): bibl_text = "[N. N.], " bibl_text += self.main_title + "." 
if self.places: diff --git a/mets_mods2tei/data/tei_skeleton.xml b/mets_mods2tei/data/tei_skeleton.xml index 1e04da9..142e491 100644 --- a/mets_mods2tei/data/tei_skeleton.xml +++ b/mets_mods2tei/data/tei_skeleton.xml @@ -8,13 +8,12 @@ [Haupttitel] + + [Zitiertitel] - - [Haupttitel einer Monographie] - diff --git a/tests/test_mets.py b/tests/test_mets.py index bf71438..984847e 100644 --- a/tests/test_mets.py +++ b/tests/test_mets.py @@ -136,7 +136,9 @@ def test_data_assignment(subtests, datadir): assert(mets.get_shelf_locators() == ['Hist.Amer.1497']) with subtests.test("Check URN"): - assert(mets.get_urn() == 'urn:nbn:de:bsz:14-db-id4971666239') + assert "urn" in mets.get_identifiers() + assert mets.get_identifiers()["urn"] == 'urn:nbn:de:bsz:14-db-id4971666239' with subtests.test("Check VD ID"): - assert(mets.get_vd_id() == 'VD18 11413883') + assert "vd18" in mets.get_identifiers() + assert mets.get_identifiers()["vd18"] == 'VD18 11413883' diff --git a/tests/test_tei.py b/tests/test_tei.py index 575c2c6..f6655ca 100644 --- a/tests/test_tei.py +++ b/tests/test_tei.py @@ -59,10 +59,6 @@ def test_data_assignment(subtests): tei.set_main_title("Testbuch") assert(tei.main_title == "Testbuch") - with subtests.test("Check publication level"): - tei.set_publication_level("m") - assert(tei.publication_level == "m") - with subtests.test("Check first subtitle"): tei.add_sub_title("Untertitel 1") assert(tei.subtitles == ["Untertitel 1"]) @@ -71,6 +67,11 @@ def test_data_assignment(subtests): tei.add_sub_title("Untertitel 2") assert(tei.subtitles == ["Untertitel 1", "Untertitel 2"]) + with subtests.test("Check publication level"): + tei.init_biblFull() + tei.set_publication_level("m") + assert(tei.publication_level == "m") + with subtests.test("Check first author"): tei.add_author({'family': 'Mustermann', 'given': 'Max', 'date': '12.10.1956', 'title': 'Dr.'}, "personal") assert(tei.authors == ["Mustermann, Max, Dr."]) @@ -133,16 +134,16 @@ def test_data_assignment(subtests): 
assert(tei.repositories == ["Kitodo.Production", "Saxonica"]) with subtests.test("Check shelfmarks"): - tei.add_shelfmark("Foo 25") - tei.add_shelfmark("HAL 9000") + tei.add_identifier("shelfmark", "Foo 25") + tei.add_identifier("shelfmark", "HAL 9000") assert(tei.shelfmarks == ["Foo 25", "HAL 9000"]) with subtests.test("Check VD ID"): - tei.add_vd_id("VD18 11413883") + tei.add_identifier("VD", "VD18 11413883") assert(tei.vd_id == "VD18 11413883") with subtests.test("Check URN"): - tei.add_urn("urn:nbn:de:bsz:14-db-id4971666239") + tei.add_identifier("URN", "urn:nbn:de:bsz:14-db-id4971666239") assert(tei.urn == "urn:nbn:de:bsz:14-db-id4971666239") with subtests.test("Check first extent"): @@ -158,5 +159,5 @@ def test_data_assignment(subtests): assert(tei.collections == ["LDP"]) with subtests.test("Check bibl"): - tei.compile_bibl() + tei.compile_bibl('M') assert(tei.bibl.text == "Mustermann, Max, Dr.; Mustermann Max 12.10.1956 Dr.: Testbuch. Dresden u. a., 01.01.1823.") From 9a5f48627984f19ccc4f0790c37ab3ca66553aae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 13 Dec 2021 18:12:54 +0100 Subject: [PATCH 23/35] fall back to biblFull title level u --- mets_mods2tei/api/tei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 04bef62..ba380d5 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -60,7 +60,7 @@ def fill_from_mets(self, mets, ocr=True): self.init_biblFull() # publication level - self.set_publication_level(mets.biblevel) + self.set_publication_level(mets.biblevel or 'u') # authors for typ, author in mets.get_authors(): From 55353e53f0a92d8c25c4c774e4256fef677c4114 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 14 Dec 2021 01:16:54 +0100 Subject: [PATCH 24/35] keep going if there is no author and div type --- mets_mods2tei/api/tei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mets_mods2tei/api/tei.py 
b/mets_mods2tei/api/tei.py index ba380d5..bc787a8 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -601,7 +601,7 @@ def compile_bibl(self, type_): bibl_text = "" if self.authors: bibl_text += "; ".join(self.authors) + ": " - elif type_.startswith("M"): + elif type_ and type_.startswith("M"): bibl_text = "[N. N.], " bibl_text += self.main_title + "." if self.places: From 0bf8bd350395457e2cc5f6b0378f2ae02793be62 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 01:16:42 +0100 Subject: [PATCH 25/35] fix tei:collection --- mets_mods2tei/api/tei.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index bc787a8..5a989a4 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -324,7 +324,7 @@ def collections(self): Return information on the collections of the work represented by the TEI Header. """ - return [collection.text for collection in self.tree.xpath('//tei:profileDesc/tei:creation', namespaces=ns)] + return [collection.text for collection in self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:collection', namespaces=ns)] @property def bibl(self): @@ -588,9 +588,9 @@ def add_collection(self, collection): """ Add a (free-text) collection of the digital document """ - profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] - creation = etree.SubElement(profile_desc, "%screation" % TEI) - creation.text = collection + profile_desc = self.tree.xpath('//tei:msDesc/tei:msIdentifier', namespaces=ns)[0] + coll = etree.SubElement(profile_desc, "%scollection" % TEI) + coll.text = collection def compile_bibl(self, type_): """ From 7962b8c5d43a836c96d26df1e104c5e90698859d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 01:19:27 +0100 Subject: [PATCH 26/35] fix tei:repository (from list-valued mods:physicalLocation), add tei:idno from mods:url --- mets_mods2tei/api/mets.py | 19 ++++++++++++++----- 
mets_mods2tei/api/tei.py | 29 ++++++++++++++++++++++++----- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 7567bc3..8221bf1 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -77,7 +77,8 @@ def __init__(self): self.license_url = None self.encoding_date = None self.encoding_desc = None - self.owner_manuscript = None + self.location_phys = None + self.location_urls = None self.shelf_locators = None self.identifiers = None self.scripts = None @@ -339,7 +340,9 @@ def norm_title_first(titleInfo): if location.get_shelfLocator(): self.shelf_locators.extend([shelf_locator.get_valueOf_() for shelf_locator in location.get_shelfLocator()]) elif location.get_physicalLocation(): - self.owner_manuscript = location.get_physicalLocation() + self.location_phys = location.get_physicalLocation()[0] + elif location.get_url(): + self.location_urls = location.get_url() # # URN and VD ID @@ -513,11 +516,17 @@ def get_encoding_description(self): """ return self.encoding_desc - def get_owner_manuscript(self): + def get_location_phys(self): """ - Return the owner of the original manuscript + Return the physical location of the original manuscript """ - return self.owner_manuscript + return self.location_phys + + def get_location_urls(self): + """ + Return the URL location of the original manuscript + """ + return self.location_urls def get_shelf_locators(self): """ diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 5a989a4..0a3f65c 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -5,6 +5,7 @@ import os import logging import copy +import re from contextlib import closing from urllib.request import urlopen @@ -99,8 +100,26 @@ def fill_from_mets(self, mets, ocr=True): self.set_encoding_description(mets.get_encoding_description()) # repository - if mets.get_owner_manuscript(): - self.add_repository(mets.get_owner_manuscript()) + if 
mets.get_location_phys(): + # hard to distinguish between settlement, institution and repository at this point + self.add_repository(mets.get_location_phys()) + if mets.get_location_urls(): + for url in mets.get_location_urls(): + # hard to determine type of URL at this point – could be (some form of) presentation, + # URN, PPN, EPN, DOI, URLWeb, URLCatalogue, URLImages, URLText, URLHTML, URLXML, URLTCF, URLIIIF + if url.startswith("urn:"): + typ = "URN" + elif re.fullmatch("10[.][0-9]*/.*", url): + typ = "DOI" + elif re.fullmatch("[0-9]{8}[0-9X]{1,2}", url): + typ = "PPN" + elif re.fullmatch("([0-9]+-)+[0-9]+", url): + typ = "ISBN" + elif re.fullmatch("[0-9]{4}-[0-9]{3}[0-9xX]", url): + typ = "ISSN" + else: + typ = "URL" + self.add_identifier(typ, url) # shelf locator for shelf_locator in mets.get_shelf_locators(): @@ -534,13 +553,13 @@ def set_encoding_description(self, creator): encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI) encoding_desc_details.text = "Encoded with the help of %s." 
% creator - def add_repository(self, repository): + def add_repository(self, name): """ Add the repository of the (original) manuscript """ ms_ident = self.tree.xpath('//tei:msDesc/tei:msIdentifier', namespaces=ns)[0] - repository_node = etree.SubElement(ms_ident, "%srepository" % TEI) - repository_node.text = repository + repository = etree.SubElement(ms_ident, "%srepository" % TEI) + repository.text = name def add_identifier(self, type_, value): """ From 073f2b1c816c4eb822f20aa81d25e1f611f3b19b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 01:59:23 +0100 Subject: [PATCH 27/35] fix 7962b8c5 --- mets_mods2tei/api/mets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 8221bf1..8126081 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -340,9 +340,9 @@ def norm_title_first(titleInfo): if location.get_shelfLocator(): self.shelf_locators.extend([shelf_locator.get_valueOf_() for shelf_locator in location.get_shelfLocator()]) elif location.get_physicalLocation(): - self.location_phys = location.get_physicalLocation()[0] + self.location_phys = location.get_physicalLocation()[0].get_valueOf_() elif location.get_url(): - self.location_urls = location.get_url() + self.location_urls = [url.get_valueOf_() for url in location.get_url()] # # URN and VD ID From 8d2fc4163848e016f7f06f48e1638d19eae7c2a7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 02:42:56 +0100 Subject: [PATCH 28/35] add tei:notesStmt/tei:note from mods:note --- mets_mods2tei/api/mets.py | 9 ++++++++- mets_mods2tei/api/tei.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 8126081..5691910 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -69,6 +69,7 @@ def __init__(self): self.editors = None self.places = None self.dates = None + self.notes 
= None self.publishers = None self.edition = None self.digital_origin = None @@ -214,6 +215,12 @@ def norm_title_first(titleInfo): elif role.get_valueOf_() == "aut": self.authors.append((typ, person)) + notes = self.mods.get_note() + if notes: + self.notes = [note.get_valueOf_() for note in notes] + else: + self.notes = [] + # # orgin info origin_info = self.mods.get_originInfo() @@ -224,7 +231,7 @@ def norm_title_first(titleInfo): for place in origin_info[0].get_place(): place_ext = {} for place_term in place.get_placeTerm(): - place_ext[place_term.get_type()] = place_term.get_valueOf_() + place_ext[place_term.get_type() or 'text'] = place_term.get_valueOf_() self.places.append(place_ext) # publication dates diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 0a3f65c..b6676cc 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -67,6 +67,10 @@ def fill_from_mets(self, mets, ocr=True): for typ, author in mets.get_authors(): self.add_author(author,typ) + # notes + for note in mets.get_notes(): + self.add_note(note) + # places for place in mets.get_places(): self.add_place(place) @@ -445,6 +449,19 @@ def add_author(self, person, typ): for title_stmt in self.tree.xpath('//tei:titleStmt', namespaces=ns): title_stmt.append(copy.deepcopy(author)) + def add_note(self, note): + """ + Add a note with details about the document. + """ + fileDesc = self.tree.xpath('//tei:fileDesc', namespaces=ns)[0] + if not fileDesc.xpath('/tei:notesStmt', namespaces=ns): + notes = etree.SubElement(fileDesc, "%snotesStmt" % TEI) + else: + notes = fileDesc.xpath('/tei:notesStmt', namespaces=ns)[0] + node = etree.SubElement(notes, "%snote" % TEI) + node.text = note + node.set("type", "remarkDocument") + def add_place(self, place): """ Add a publication place to the publication statement. 
From 06f1ccfc66a2fb45852e312e44a54ef42982667b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 04:31:07 +0100 Subject: [PATCH 29/35] fix tei:editionStmt (does not belong under titleStmt) --- mets_mods2tei/api/tei.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index b6676cc..a7c893d 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -68,7 +68,7 @@ def fill_from_mets(self, mets, ocr=True): self.add_author(author,typ) # notes - for note in mets.get_notes(): + for note in mets.notes: self.add_note(note) # places @@ -277,7 +277,7 @@ def digital_editions(self): Return information on the editions of the digitalized work represented by the TEI Header. """ - return [digital_edition.text for digital_edition in self.tree.xpath('//tei:fileDesc/tei:titleStmt/tei:editionStmt/tei:edition', namespaces=ns)] + return [digital_edition.text for digital_edition in self.tree.xpath('//tei:fileDesc/tei:editionStmt/tei:edition', namespaces=ns)] @property def encoding_dates(self): @@ -509,7 +509,7 @@ def add_digital_edition(self, digital_edition): """ Add an edition statement with details on the digital edition. 
""" - title_stmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + title_stmt = self.tree.xpath('//tei:fileDesc', namespaces=ns)[0] edition_stmt = etree.SubElement(title_stmt, "%seditionStmt" % TEI) edition = etree.SubElement(edition_stmt, "%sedition" % TEI) edition.text = digital_edition From c49c2a4e47be1afd946f767892efe3e57833829e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 04:32:34 +0100 Subject: [PATCH 30/35] add tei:keywords | tei:classCode under tei:textClass (for mods:subject | mods:classification) --- mets_mods2tei/api/mets.py | 22 ++++++++++++++++++++++ mets_mods2tei/api/tei.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 5691910..fe77cab 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -85,6 +85,8 @@ def __init__(self): self.scripts = None self.collections = None self.languages = None + self.classifications = None + self.subjects = None self.extents = None self.series = None @@ -272,6 +274,26 @@ def norm_title_first(titleInfo): if not self.scripts: self.scripts.append(self.script_iso.get('Unknown')) + # + # classifications and subjects + classifications = self.mods.get_classification() + self.classifications = dict() + if classifications: + for classification in classifications: + codes = self.classifications.setdefault(classification.get_authority(), list()) + codes.append(classification.get_valueOf_()) + subjects = self.mods.get_subject() + self.subjects = dict() + if subjects: + for subject in subjects: + keywords = self.subjects.setdefault(subject.get_authority(), list()) + for topic in subject.topic: + keywords.append(('topic', topic.get_valueOf_())) + for geographic in subject.geographic: + keywords.append(('geographic', geographic.get_valueOf_())) + for temporal in subject.temporal: + keywords.append(('temporal', temporal.get_valueOf_())) + # # physical description 
physical_description = self.mods.get_physicalDescription() diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index a7c893d..ccdda13 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -144,6 +144,15 @@ def fill_from_mets(self, mets, ocr=True): for ident_name in mets.get_languages().items(): self.add_language(ident_name) + # classes + for scheme in mets.classifications: + classes = mets.classifications[scheme] + for code in classes: + self.add_classcode(scheme, code) + for scheme in mets.subjects: + keywords = mets.subjects[scheme] + self.add_keywords(scheme, keywords) + # extents for extent in mets.extents: self.add_extent(extent) @@ -598,6 +607,36 @@ def set_type_desc(self, description): par = etree.SubElement(type_desc, "%sp" % TEI) par.text = line + def add_classcode(self, scheme, code): + """ + Add a document classification code. + """ + profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] + if not profile_desc.xpath('/tei:textClass', namespaces=ns): + textclass = etree.SubElement(profile_desc, "%stextClass" % TEI) + else: + textclass = profile_desc.xpath('/tei:textClass', namespaces=ns)[0] + classcode = etree.SubElement(textclass, "%sclassCode" % TEI) + classcode.set("scheme", scheme) + classcode.text = code + + def add_keywords(self, scheme, terms): + """ + Add a document classification list of terms. 
+ """ + profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] + if not profile_desc.xpath('/tei:textClass', namespaces=ns): + textclass = etree.SubElement(profile_desc, "%stextClass" % TEI) + else: + textclass = profile_desc.xpath('/tei:textClass', namespaces=ns)[0] + keywords = etree.SubElement(textclass, "%skeywords" % TEI) + keywords.set("scheme", scheme) + for type_, term in terms: + node = etree.SubElement(keywords, "%sterm" % TEI) + node.text = term + if type_: + node.set("type", type_) + def add_language(self, language): """ Add a language of the source document From 27127febd228a73d2da5d3568e4c61f05fdee939 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 10:25:04 +0100 Subject: [PATCH 31/35] chdir to METS dir if not URL --- mets_mods2tei/scripts/mets_mods2tei.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index cb4b059..7d08253 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -39,6 +39,8 @@ def cli(mets, output, ocr, text_group, img_group, log_level): f = urlopen(mets) except: f = open(mets, "rb") + # physical file: enter METS directory for relative FLocat refs + os.chdir(os.path.dirname(mets)) # # read in METS From 8ac0747e086174d12041e52c5e558796f200ebf3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 10:30:51 +0100 Subject: [PATCH 32/35] fix mods:location (only once, but multiple contents) --- mets_mods2tei/api/mets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index fe77cab..1533bdb 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -365,12 +365,13 @@ def norm_title_first(titleInfo): # location-related elements are optional or conditional self.shelf_locators = [] - for location in self.mods.get_location(): + if self.mods.get_location(): + location = 
self.mods.get_location()[0] if location.get_shelfLocator(): self.shelf_locators.extend([shelf_locator.get_valueOf_() for shelf_locator in location.get_shelfLocator()]) - elif location.get_physicalLocation(): + if location.get_physicalLocation(): self.location_phys = location.get_physicalLocation()[0].get_valueOf_() - elif location.get_url(): + if location.get_url(): self.location_urls = [url.get_valueOf_() for url in location.get_url()] # From 20546afbb2b98dccfcf28cf7ecef2050290c1c33 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Dec 2021 10:33:39 +0100 Subject: [PATCH 33/35] fix regression in 27127febd --- mets_mods2tei/scripts/mets_mods2tei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index 7d08253..da677f1 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -40,7 +40,7 @@ def cli(mets, output, ocr, text_group, img_group, log_level): except: f = open(mets, "rb") # physical file: enter METS directory for relative FLocat refs - os.chdir(os.path.dirname(mets)) + os.chdir(os.path.normpath(os.path.dirname(mets))) # # read in METS From f33a4ca30269e204b35f78ed95256d5eb5681acd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 6 Jan 2022 12:46:04 +0100 Subject: [PATCH 34/35] drop Python 3.5 --- .circleci/config.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 668e893..9d95978 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -45,7 +45,7 @@ workflows: - test: matrix: parameters: - version: [3.5.10, 3.6.15, 3.7.12, 3.8.12, 3.9.9] + version: [3.6.15, 3.7.12, 3.8.12, 3.9.9] deploy: jobs: - pypi: diff --git a/setup.py b/setup.py index 4e368e1..9374c1b 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ packages=find_packages(exclude=('tests', 'docs')), package_data={'mets_mods2tei' : 
['data/tei_skeleton.xml', 'data/iso15924-utf8-20180827.txt']}, install_requires=open('requirements.txt').read().split('\n'), - python_requires=">=3.5", + python_requires=">=3.6", classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', From 8204bfc04dc6dd39f71af62a5772b5f73b20f4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kay-Michael=20W=C3=BCrzner?= Date: Thu, 6 Jan 2022 13:58:30 +0100 Subject: [PATCH 35/35] Revert regression fix in README.md https://www.capstoneediting.com.au/blog/how-to-hyphenate-a-compound-noun --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ac616d..0f9ce70 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ Usage: mm2tei [OPTIONS] METS Options: -O, --output FILENAME File path to write TEI output to -o, --ocr Serialize OCR into resulting TEI - -T, --text-group TEXT File group which contains the full-text + -T, --text-group TEXT File group which contains the full text -I, --img-group TEXT File group which contains the images -l, --log-level [DEBUG|INFO|WARN|ERROR|OFF] -h, --help Show this message and exit.