diff --git a/.circleci/config.yml b/.circleci/config.yml index 30a331e..9d95978 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,19 +1,54 @@ # Python CircleCI 2.1 configuration file # for mets-mods2tei # -# Check https://circleci.com/docs/2.1/language-python/ for more details +# Check https://circleci.com/docs/2.0/language-python/ for more details # version: 2.1 orbs: codecov: codecov/codecov@1.0.5 jobs: - build: + test: + parameters: + version: + type: string docker: - - image: python:3.6 + - image: circleci/python:<< parameters.version >> working_directory: ~/repo steps: - checkout - - run: pip install -r requirements-test.txt - - run: pip install . + - run: make deps deps-test + - run: make install + - run: make test - run: make coverage - codecov/upload + pypi: + docker: + - image: circleci/python:3.6 + working_directory: ~/repo + steps: + - checkout + - setup_remote_docker + - run: make install + - run: python setup.py sdist + - run: | + pip install cibuildwheel + cibuildwheel --output-dir dist + - store_artifacts: + path: dist/ + destination: artifacts + # later: upload to PyPI... + +workflows: + version: 2 + test-all: + jobs: + - test: + matrix: + parameters: + version: [3.6.15, 3.7.12, 3.8.12, 3.9.9] + deploy: + jobs: + - pypi: + filters: + branches: + only: master diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..69ae629 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,64 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] +### Added +- tests for TEI API +- tests for insertion index identification +- more logging +- CLI param for output file +- CLI param for image fileGrp + +### Changed +- Add `front`, `body` and `back` per default +- Log to stderr instead of stdout +- Differentiate between (physical) image nr and (logical) page nr + +### Fixed +- Evaluate texts from all struct types but `binding` and `colour_checker`, #43 +- Handle errors during language code expansion, and fallback to `Unbekannt`, #47 +- Add ALTO `HYP` text content if available, #52 +- Allow empty logical structMap and structLink, fallback to physical, or empty, #57 +- Allow partial dmdSec (MODS) or amdSec, fallback to empty, #46, #51 +- Pass all `mods:identifier`s to `msIdentifier/idno` (not just VD and URN) +- Parse full `titleInfo` (main/sub/part/volume), and re-use in `biblFull` +- Prefer `titleInfo/title` over `div/@LABEL` if available +- Map top logical `div/@TYPE` into allowed `biblFull/title/@level` only +- Map top logical `div/@TYPE` into appropriate `bibl/@type` if possible + +## [0.1.1] - 2020-05-11 +### Added +- Make full text file group selectable by user +- Add poor man's namespace versioning handling + +### Changed +- Make extraction of subtitles conditional on their presence +- Use "licence" for all types of licences (even unknown ones), #39 + +### Fixed +- Handle nested `@ADMID="AMD"` divs in logical `structMap` (i.e. 
newspaper case), #43 +- Allow for local path entries (in addition to URLs) in METS, #41 +- Add special treatment for URNs and VD IDs, #37 + +## [0.1.0] - 2019-12-04 +### Added +- Correctly place structures which are not on top of a page +- Set `corresp` and `facs` attributes of `pb` elements +- Store links to `DEFAULT` images in METS +- Tests for new functionality +- Add Changelog file, #28 + +### Changed +- Retrieve ALTO files via a dedicated struct link member of the class `Mets` +- Move text retrieval to `Alto` class + +### Removed +- Get rid of code artifacts carried over from `tocrify` + + +[unreleased]: ../../compare/v0.1.1...master +[0.1.1]: ../../compare/v0.1.0...v0.1.1 +[0.1.0]: ../../compare/v1.0...v0.1.0 diff --git a/Changelog b/Changelog deleted file mode 100644 index 82258e3..0000000 --- a/Changelog +++ /dev/null @@ -1,52 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [Unreleased] -### Changed -- Added tests for TEI API -- Added tests for insertion index identification -- Evaluate texts from all struct types but `binding` and `colour_checker` -- Add `front`, `body` and `back` per default - -### Fixed -- https://github.com/slub/mets-mods2tei/issues/43 -- https://github.com/slub/mets-mods2tei/issues/47 - -## [0.1.1] - 2020-05-11 -### Added -- Treat nested AMD-type (non-logical) divs in logical struct map (i.e. 
-newspaper case) -- Make full text file group selectable by user -- Allow for file entries (in addition to URLs) in METS -- Add special treatment for URNs and VD IDs -- Add poor man's namespace versioning handling - -### Changed -- Make extraction of subtitles conditional on their presence -- Use "licence" for all types of licences (even unknown ones) - -### Fixed -- https://github.com/slub/mets-mods2tei/issues/28 -- https://github.com/slub/mets-mods2tei/issues/37 -- https://github.com/slub/mets-mods2tei/issues/39 -- https://github.com/slub/mets-mods2tei/issues/41 - -## [0.1.0] - 2019-12-04 -### Added -- Correctly place structures which are not on top of a page -- Set `corresp` and `facs` attributes of `pb` elements -- Store links to `DEFAULT` images in METS -- Tests for new functionality - -### Changed -- Retrieve ALTO files via a dedicated struct link member of the class Mets -- Move text retrieval to Alto class - -### Removed -- Get rid of code artifacts carried over from `tocrify` - -### Fixed -- https://github.com/slub/mets-mods2tei/issues/28 diff --git a/Makefile b/Makefile index fdc422a..3750276 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # Python interpreter. Default: '$(PYTHON)' -PYTHON = python +PYTHON ?= python +PIP ?= pip # BEGIN-EVAL makefile-parser --make-help Makefile @@ -7,12 +8,16 @@ help: @echo "" @echo " Targets" @echo "" + @echo " install Install this package" + @echo " deps Install dependencies only" + @echo " deps-test Install dependencies for testing only" @echo " test Run all unit tests" @echo " coverage Run coverage tests" @echo "" @echo " Variables" @echo "" @echo " PYTHON Python interpreter. Default: '$(PYTHON)'" + @echo " PIP Python packager. Default: '$(PIP)'" # END-EVAL @@ -20,7 +25,16 @@ help: # Tests # -.PHONY: test coverage +.PHONY: install test coverage deps deps-test + +install: + $(PIP) install . 
+ +deps: + $(PIP) install -r requirements.txt + +deps-test: + $(PIP) install -r requirements-test.txt # Run all unit tests test: diff --git a/README.md b/README.md index 63df7e0..0f9ce70 100644 --- a/README.md +++ b/README.md @@ -106,11 +106,22 @@ Usage: mm2tei [OPTIONS] METS METS: File containing or URL pointing to the METS/MODS XML to be converted + Parse given METS and its meta-data, and convert it to TEI. + + If `--ocr` is given, then also read the ALTO full-text files from the + fileGrp in `--text-group`, and convert page contents accordingly (in + physical order). Decorate page boundaries with image and page numbers, and + reference the corresponding base image files from `--img-group`. + + Output XML to `--output` (use '-' for stdout), log to stderr. + Options: + -O, --output FILENAME File path to write TEI output to -o, --ocr Serialize OCR into resulting TEI -T, --text-group TEXT File group which contains the full text + -I, --img-group TEXT File group which contains the images -l, --log-level [DEBUG|INFO|WARN|ERROR|OFF] - --help Show this message and exit. + -h, --help Show this message and exit. ``` It reads METS XML via URL or file argument and prints the resulting TEI, @@ -118,5 +129,4 @@ including the extracted information from the MODS part of the METS. Example: - mm2tei "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" - + mm2tei -O tei.xml "https://digital.slub-dresden.de/oai/?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-453779263" diff --git a/mets_mods2tei/api/alto.py b/mets_mods2tei/api/alto.py index e1a2cec..4da9af9 100644 --- a/mets_mods2tei/api/alto.py +++ b/mets_mods2tei/api/alto.py @@ -92,7 +92,11 @@ def get_text_in_line(self, line): Returns the ALTO-encoded text . :param Element line: The line to extract the text from.
""" - return " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) + text = " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) + hyp = line.find("alto:HYP", namespaces=ns) + if hyp is not None: + text += hyp.get("CONTENT") + return text def __compute_fuzzy_distance(self, text1, text2): """ diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 56359ba..1533bdb 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -52,18 +52,24 @@ def __init__(self): self.tree = None self.mets = None self.mods = None + self.page_map = {} self.order_map = {} + self.orderlabel_map = {} self.img_map = {} self.alto_map = {} self.struct_links = {} self.fulltext_group_name = 'FULLTEXT' + self.image_group_name = 'DEFAULT' self.title = None self.sub_titles = None + self.part_titles = None + self.volume_titles = None self.authors = None self.editors = None self.places = None self.dates = None + self.notes = None self.publishers = None self.edition = None self.digital_origin = None @@ -72,13 +78,15 @@ def __init__(self): self.license_url = None self.encoding_date = None self.encoding_desc = None - self.owner_manuscript = None + self.location_phys = None + self.location_urls = None self.shelf_locators = None - self.urn = None - self.vd_id = None + self.identifiers = None self.scripts = None self.collections = None self.languages = None + self.classifications = None + self.subjects = None self.extents = None self.series = None @@ -122,18 +130,74 @@ def __spur(self): """ # - # main title and manuscript type - struct_map_logical = list(filter(lambda x: x.get_TYPE() == "LOGICAL", self.mets.get_structMap()))[0] - title = struct_map_logical.get_div() - self.title = title.get_LABEL() - self.type = title.get_TYPE() + # get publication level + # get main and sub title from top-level logical div as a fallback + self.title = "" + self.biblevel = None + self.bibtype = None + div = 
self.get_div_structure() + if div: + self.title = div.get_LABEL() # overridden by any titleInfo + div_type = div.get_TYPE() + # differentiate between analytic and closed, periodic and singular, dependent and independent types + # (for use in bibl/@type and biblFull//title/@level): + # FIXME: verify this ruleset is correct/standardized (but criteria do not look orthogonal, e.g. "issue" and "proceeding") + if div_type in ["bachelor_thesis", "diploma_thesis", "magister_thesis", "master_thesis", "doctoral_thesis", "habilitation_thesis", "file", "register", "research_paper", "report", "atlas", "album", "letter", "document", "leaflet", "manuscript", "poster", "plan", "study", "judgement", "preprint", "dossier", "paper"]: + self.biblevel = 'u' # unpublished + self.bibtype = 'M' # monograph + elif div_type in ["contained_work", "folder", ]: + self.biblevel = 'a' + self.bibtype = 'DM' # dependent part of monograph + # ? or 'DS' # dependent part of series + elif div_type in ["article"]: + self.biblevel = 'a' # analytic + self.bibtype = 'JA' # journal article + elif div_type in ["periodical", "newspaper"]: + self.biblevel = 'j' # journal + self.bibtype = 'J' # journal + elif div_type in ["lecture"]: + self.biblevel = 's' # series + self.bibtype = '' # ? + elif div_type in ["monograph", ]: + self.biblevel = 'm' # monograph + self.bibtype = 'M' # monograph + elif div_type in ["multivolume_work", "volume"]: + self.biblevel = 'm' # monograph + self.bibtype = 'MM' # monograph within multi-volume monograph + # ? or 'MS' # monograph within series + # ?
or 'MMS' # monograph within multi-volume monograph series # - # sub titles - self.sub_titles = [] - for title_info in self.mods.get_titleInfo(): + # titleInfo (main, sub, part/volume) + self.sub_titles = [] # subtitle (mods:titleInfo[mods:subTitle] + self.part_titles = dict() # part title of multipart subseries (mods:titleInfo[mods:partNumber|mods:partName]) + self.volume_titles = dict() # volume title in multivolume monograph (mods:part[mods:detail]) + title_infos = self.mods.get_titleInfo() + if len(title_infos): + def norm_title_first(titleInfo): + if not titleInfo.get_type() or titleInfo.get_type() == 'simple': + # prefer untyped entry ('simple' most likely is from generateDS) + return -1 + if titleInfo.get_type() == 'uniform': + return 0 + return 1 + title_info = sorted(title_infos, key=norm_title_first)[0] + if title_info.get_title(): + self.title = title_info.get_title()[0].get_valueOf_().strip() for sub_title in title_info.get_subTitle(): self.sub_titles.append(sub_title.get_valueOf_().strip()) + for part_number, part_name in zip(title_info.get_partNumber(), title_info.get_partName()): + self.part_titles[part_number.get_valueOf_().strip()] = part_name.get_valueOf_().strip() + part_infos = self.mods.get_part() + if len(part_infos): + part_info = part_infos[0] + order = str(part_info.get_order() or 0) + for detail in part_info.get_detail(): + typ = detail.get_type() + val = ', '.join([title.get_valueOf_().strip() + for title in detail.get_number() + detail.get_caption() + detail.get_title()]) + self.volume_titles[order, typ] = val + # # authors and editors self.authors = [] @@ -145,7 +209,7 @@ def __spur(self): person[name_part.get_type()] = name_part.get_valueOf_() # either author or editor - roles = name.get_role()[0].get_roleTerm() + roles = name.get_role()[0].get_roleTerm() if name.get_role() else [] # TODO: handle the complete set of allowed roles for role in roles: if role.get_valueOf_() == "edt": @@ -153,31 +217,42 @@ def __spur(self): elif 
role.get_valueOf_() == "aut": self.authors.append((typ, person)) + notes = self.mods.get_note() + if notes: + self.notes = [note.get_valueOf_() for note in notes] + else: + self.notes = [] + # # orgin info - origin_info = self.mods.get_originInfo()[0] + origin_info = self.mods.get_originInfo() # publication place self.places = [] - for place in origin_info.get_place(): - place_ext = {} - for place_term in place.get_placeTerm(): - place_ext[place_term.get_type()] = place_term.get_valueOf_() - self.places.append(place_ext) + if origin_info: + for place in origin_info[0].get_place(): + place_ext = {} + for place_term in place.get_placeTerm(): + place_ext[place_term.get_type() or 'text'] = place_term.get_valueOf_() + self.places.append(place_ext) # publication dates self.dates = {} - for date_issued in origin_info.get_dateIssued(): - date_type = date_issued.get_point() if date_issued.get_point() != None else "unspecified" - self.dates[date_type] = date_issued.get_valueOf_() + if origin_info: + for date_issued in origin_info[0].get_dateIssued(): + date_type = date_issued.get_point() if date_issued.get_point() != None else "unspecified" + self.dates[date_type] = date_issued.get_valueOf_() # publishers self.publishers = [] - for publisher in origin_info.get_publisher(): - self.publishers.append(publisher.get_valueOf_()) + if origin_info: + for publisher in origin_info[0].get_publisher(): + self.publishers.append(publisher.get_valueOf_()) # edition of the manuscript - self.edition = origin_info.get_edition()[0].get_valueOf_() if origin_info.get_edition() else "" + self.edition = "" + if origin_info and origin_info[0].get_edition(): + self.edition = origin_info[0].get_edition()[0].get_valueOf_() # # languages and scripts @@ -199,30 +274,57 @@ def __spur(self): if not self.scripts: self.scripts.append(self.script_iso.get('Unknown')) + # + # classifications and subjects + classifications = self.mods.get_classification() + self.classifications = dict() + if classifications: + 
for classification in classifications: + codes = self.classifications.setdefault(classification.get_authority(), list()) + codes.append(classification.get_valueOf_()) + subjects = self.mods.get_subject() + self.subjects = dict() + if subjects: + for subject in subjects: + keywords = self.subjects.setdefault(subject.get_authority(), list()) + for topic in subject.topic: + keywords.append(('topic', topic.get_valueOf_())) + for geographic in subject.geographic: + keywords.append(('geographic', geographic.get_valueOf_())) + for temporal in subject.temporal: + keywords.append(('temporal', temporal.get_valueOf_())) + # # physical description - physical_description = self.mods.get_physicalDescription()[0] + physical_description = self.mods.get_physicalDescription() # digital origin - self.digital_origin = physical_description.get_digitalOrigin()[0] if physical_description.get_digitalOrigin() else "" + self.digital_origin = "" + if physical_description and physical_description[0].get_digitalOrigin(): + self.digital_origin = physical_description[0].get_digitalOrigin()[0] # extent self.extents = [] - for extent in physical_description.get_extent(): - self.extents.append(extent.get_valueOf_()) + if physical_description: + for extent in physical_description[0].get_extent(): + self.extents.append(extent.get_valueOf_()) # # dv FIXME: replace with generated code as soon as schema is available - dv = etree.fromstring(self.mets.get_amdSec()[0].get_rightsMD()[0].get_mdWrap().get_xmlData().get_anytypeobjs_()[0]) + amdsec = self.mets.get_amdSec() + if amdsec and amdsec[0].get_rightsMD(): + dv = etree.fromstring(amdsec[0].get_rightsMD()[0].get_mdWrap().get_xmlData().get_anytypeobjs_()[0]) + else: + dv = None # owner of the digital edition - self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text + self.owner_digital = dv.xpath("//dv:owner", namespaces=ns)[0].text if dv is not None else "" # availability/license # common case self.license = "" self.license_url = "" - 
license_nodes = dv.xpath("//dv:license", namespaces=ns) + license_nodes = dv.xpath("//dv:license", namespaces=ns) if dv is not None else [] if license_nodes != []: self.license = license_nodes[0].text self.license_url = "" @@ -237,34 +339,48 @@ def __spur(self): # # metsHdr header = self.mets.get_metsHdr() - - # encoding date - self.encoding_date = header.get_CREATEDATE().isoformat() - - # encoding description - self.encoding_desc = list(filter(lambda x: x.get_OTHERTYPE() == "SOFTWARE", header.get_agent()))[0].get_name() + if header: + # encoding date + self.encoding_date = header.get_CREATEDATE() + # encoding description + self.encoding_desc = [agent.get_name() + for agent in header.get_agent() + if agent.get_TYPE() == "OTHER" and agent.get_OTHERTYPE() == "SOFTWARE"] + else: + self.encoding_date = None + self.encoding_desc = None + + if self.encoding_date: + self.encoding_date = self.encoding_date.isoformat() + else: + self.logger.error("Found no @CREATEDATE for publicationStmt/date") + if self.encoding_desc: + self.encoding_desc = self.encoding_desc[0] # or -1? + # what about agent.get_OTHERROLE() and agent.get_note()? 
+ else: + self.logger.error("Found no mets:agent for encodingDesc") # # location of manuscript # location-related elements are optional or conditional self.shelf_locators = [] - for location in self.mods.get_location(): + if self.mods.get_location(): + location = self.mods.get_location()[0] if location.get_shelfLocator(): self.shelf_locators.extend([shelf_locator.get_valueOf_() for shelf_locator in location.get_shelfLocator()]) - elif location.get_physicalLocation(): - self.owner_manuscript = location.get_physicalLocation() + if location.get_physicalLocation(): + self.location_phys = location.get_physicalLocation()[0].get_valueOf_() + if location.get_url(): + self.location_urls = [url.get_valueOf_() for url in location.get_url()] # # URN and VD ID - self.urn = "" - self.vd_id = "" + self.identifiers = dict() identifiers = self.mods.get_identifier() - for identifier in identifiers: - if identifier.get_type().lower() == "urn": - self.urn = identifier.get_valueOf_() - elif identifier.get_type().lower().startswith("vd"): - self.vd_id = identifier.get_valueOf_() + if len(identifiers): + for identifier in identifiers: + self.identifiers[identifier.get_type()] = identifier.get_valueOf_() # # collections (from relatedItem) @@ -284,30 +400,43 @@ def __spur(self): if fulltext_group: fulltext_map = {} for entry in fulltext_group[0].xpath("./mets:file", namespaces=ns): - fulltext_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) - - # default - default_map = {} - default_group = self.tree.xpath("//mets:fileGrp[@USE='DEFAULT']", namespaces=ns) - if default_group: - for entry in default_group[0].xpath("./mets:file", namespaces=ns): - default_map[entry.get("ID")] = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + url = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + self.logger.debug("Found full-text file: %s", url) + fulltext_map[entry.get("ID")] = url + + # image + image_map = {} + image_group = 
self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.image_group_name, namespaces=ns) + if image_group: + for entry in image_group[0].xpath("./mets:file", namespaces=ns): + url = entry.find("./" + METS + "FLocat").get("%shref" % XLINK) + self.logger.debug("Found image file: %s", url) + image_map[entry.get("ID")] = url # struct map physical - for div in list(filter(lambda x: x.get_TYPE() == 'PHYSICAL', self.mets.get_structMap()))[0].get_div().get_div(): - self.order_map[div.get_ID()] = div.get_ORDER() + for div in self.get_page_structure().get_div(): + page = div.get_ID() + self.logger.debug("Found physical page: %s", page) + self.page_map[page] = div + if div.get_ORDER(): + self.order_map[page] = div.get_ORDER() + if div.get_ORDERLABEL(): + self.orderlabel_map[page] = div.get_ORDERLABEL() for fptr in div.get_fptr(): if fptr.get_FILEID() in fulltext_map: - self.alto_map[div.get_ID()] = fulltext_map[fptr.get_FILEID()] - elif fptr.get_FILEID() in default_map: - self.img_map[div.get_ID()] = default_map[fptr.get_FILEID()] + self.alto_map[page] = fulltext_map[fptr.get_FILEID()] + elif fptr.get_FILEID() in image_map: + self.img_map[page] = image_map[fptr.get_FILEID()] # struct links - for sm_link in self.tree.xpath("//mets:structLink", namespaces=ns)[0].iterchildren(): - if sm_link.get("%sto" % XLINK) in self.alto_map: - if sm_link.get("%sfrom" % XLINK) not in self.struct_links: - self.struct_links[sm_link.get("%sfrom" % XLINK)] = [] - self.struct_links[sm_link.get("%sfrom" % XLINK)].append(sm_link.get("%sto" % XLINK)) + structlinks = self.tree.xpath("//mets:structLink/*", namespaces=ns) + for sm_link in structlinks: + logical = sm_link.get("%sfrom" % XLINK) + physical = sm_link.get("%sto" % XLINK) + if physical in self.alto_map: + self.logger.debug("Found structLink from %s to physical page: %s", logical, physical) + pages = self.struct_links.setdefault(logical, list()) + pages.append(physical) @property def fulltext_group_name(self): @@ -329,10 +458,22 @@ def 
get_main_title(self): def get_sub_titles(self): """ - Return the main title of the work. + Return the sub-titles of the work. """ return self.sub_titles + def get_part_titles(self): + """ + Return the part titles of the work. + """ + return self.part_titles + + def get_volume_titles(self): + """ + Return the volume titles of the work. + """ + return self.volume_titles + def get_authors(self): """ Return the author of the work. @@ -405,29 +546,29 @@ def get_encoding_description(self): """ return self.encoding_desc - def get_owner_manuscript(self): + def get_location_phys(self): """ - Return the owner of the original manuscript + Return the physical location of the original manuscript """ - return self.owner_manuscript + return self.location_phys - def get_shelf_locators(self): + def get_location_urls(self): """ - Return the shelf locators of the original manuscript + Return the URL location of the original manuscript """ - return self.shelf_locators + return self.location_urls - def get_urn(self): + def get_shelf_locators(self): """ - Return the URN of the digital representation + Return the shelf locators of the original manuscript """ - return self.urn + return self.shelf_locators - def get_vd_id(self): + def get_identifiers(self): """ - Return the VD ID of the digital representation + Return the (dict of) identifiers of the digital representation """ - return self.vd_id + return self.identifiers def get_scripts(self): """ @@ -447,6 +588,15 @@ def get_languages(self): """ return self.languages + def get_page_structure(self): + """ + Return the div structure from the physical struct map + """ + for struct_map in self.mets.get_structMap(): + if struct_map.get_TYPE() == "PHYSICAL": + return struct_map.get_div() + return None + def get_div_structure(self): """ Return the div structure from the logical struct map @@ -454,7 +604,7 @@ def get_div_structure(self): for struct_map in self.mets.get_structMap(): if struct_map.get_TYPE() == "LOGICAL": return 
struct_map.get_div() - return [] + return None def get_struct_links(self, log_id): """ @@ -476,6 +626,12 @@ def get_alto(self, phys_id): def get_order(self, phys_id): """ - Return the manually set order for a given physical ID + Return the logical (manually set) page number for a given physical ID + """ + return self.order_map.get(phys_id, "0") + + def get_orderlabel(self, phys_id): + """ + Return the logical (manually set) page label for a given physical ID """ - return self.order_map.get(phys_id, "-1") + return self.orderlabel_map.get(phys_id, "") diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 741e72b..ccdda13 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -5,6 +5,7 @@ import os import logging import copy +import re from contextlib import closing from urllib.request import urlopen @@ -38,6 +39,7 @@ def tostring(self): """ Serializes the TEI object as xml string. """ + # needs lxml>=4.5: etree.indent(self.tree, space=" ") return etree.tostring(self.tree, encoding="utf-8") def fill_from_mets(self, mets, ocr=True): @@ -50,18 +52,25 @@ def fill_from_mets(self, mets, ocr=True): # main title self.set_main_title(mets.get_main_title()) + for sub in mets.get_sub_titles(): + self.add_sub_title(sub) + for number, part in mets.get_part_titles().items(): + self.add_part_title(number, part) + for (order, typ), volume in mets.get_volume_titles().items(): + self.add_volume_title(order, typ, volume) + self.init_biblFull() # publication level - self.set_publication_level(mets.type) - - # sub titles - for sub_title in mets.get_sub_titles(): - self.add_sub_title(sub_title) + self.set_publication_level(mets.biblevel or 'u') # authors for typ, author in mets.get_authors(): self.add_author(author,typ) + # notes + for note in mets.notes: + self.add_note(note) + # places for place in mets.get_places(): self.add_place(place) @@ -95,18 +104,37 @@ def fill_from_mets(self, mets, ocr=True): 
self.set_encoding_description(mets.get_encoding_description()) # repository - if mets.get_owner_manuscript(): - self.add_repository(mets.get_owner_manuscript()) + if mets.get_location_phys(): + # hard to distinguish between settlement, institution and repository at this point + self.add_repository(mets.get_location_phys()) + if mets.get_location_urls(): + for url in mets.get_location_urls(): + # hard to determine type of URL at this point – could be (some form of) presentation, + # URN, PPN, EPN, DOI, URLWeb, URLCatalogue, URLImages, URLText, URLHTML, URLXML, URLTCF, URLIIIF + if url.startswith("urn:"): + typ = "URN" + elif re.fullmatch("10[.][0-9]*/.*", url): + typ = "DOI" + elif re.fullmatch("[0-9]{8}[0-9X]{1,2}", url): + typ = "PPN" + elif re.fullmatch("([0-9]+-)+[0-9]+", url): + typ = "ISBN" + elif re.fullmatch("[0-9]{4}-[0-9]{3}[0-9xX]", url): + typ = "ISSN" + else: + typ = "URL" + self.add_identifier(typ, url) # shelf locator for shelf_locator in mets.get_shelf_locators(): - self.add_shelfmark(shelf_locator) + self.add_identifier("shelfmark", shelf_locator) # identifiers - if mets.get_urn(): - self.add_urn(mets.get_urn()) - if mets.get_vd_id(): - self.add_vd_id(mets.get_vd_id()) + if mets.get_identifiers(): + for type_, value in mets.get_identifiers().items(): + if type_ in ["vd16", "vd17", "vd18"]: + type_ = "VD" + self.add_identifier(type_.upper(), value) # type description if mets.get_scripts(): @@ -116,6 +144,15 @@ def fill_from_mets(self, mets, ocr=True): for ident_name in mets.get_languages().items(): self.add_language(ident_name) + # classes + for scheme in mets.classifications: + classes = mets.classifications[scheme] + for code in classes: + self.add_classcode(scheme, code) + for scheme in mets.subjects: + keywords = mets.subjects[scheme] + self.add_keywords(scheme, keywords) + # extents for extent in mets.extents: self.add_extent(extent) @@ -126,13 +163,24 @@ def fill_from_mets(self, mets, ocr=True): # # citation - self.compile_bibl() + 
self.compile_bibl(mets.bibtype) # # text part # div structure - self.add_div_structure(mets.get_div_structure()) + div = mets.get_div_structure() + if div is not None: + self.logger.debug("Found logical structMap for %s", div.get_TYPE()) + self.add_div_structure(div) + elif any(mets.alto_map): + self.logger.warning("Found no logical structMap div, falling back to physical") + pages = mets.alto_map.keys() + if any(mets.order_map.values()): + pages = sorted(pages, key=mets.get_order) + self.add_div_structure(None, map(mets.page_map.get, pages)) + else: + self.logger.error("Found no logical or physical structMap div") # OCR if ocr: @@ -146,13 +194,6 @@ def main_title(self): """ return self.tree.xpath('//tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].text - @property - def publication_level(self): - """ - Return the level of publication ('monographic' vs. 'analytic') - """ - return self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].get("level") - @property def subtitles(self): """ @@ -172,6 +213,13 @@ def authors(self): authors.append(", ".join(author.xpath('descendant-or-self::*/text()'))) return authors + @property + def publication_level(self): + """ + Return the level of publication ('monographic' vs. 'analytic') + """ + return self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].get("level") + @property def dates(self): """ @@ -238,7 +286,7 @@ def digital_editions(self): Return information on the editions of the digitalized work represented by the TEI Header. 
""" - return [digital_edition.text for digital_edition in self.tree.xpath('//tei:fileDesc/tei:titleStmt/tei:editionStmt/tei:edition', namespaces=ns)] + return [digital_edition.text for digital_edition in self.tree.xpath('//tei:fileDesc/tei:editionStmt/tei:edition', namespaces=ns)] @property def encoding_dates(self): @@ -308,7 +356,7 @@ def collections(self): Return information on the collections of the work represented by the TEI Header. """ - return [collection.text for collection in self.tree.xpath('//tei:profileDesc/tei:creation', namespaces=ns)] + return [collection.text for collection in self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:collection', namespaces=ns)] @property def bibl(self): @@ -320,26 +368,70 @@ def bibl(self): def set_main_title(self, string): """ - Set the main title of the title statements. + Set the main title of the tei:titleStmt. """ - for main_title in self.tree.xpath('//tei:titleStmt/tei:title[@type="main"]', namespaces=ns): - main_title.text = string + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + for node in titleStmt.xpath('tei:title[@type="main"]', namespaces=ns): + node.text = string - def set_publication_level(self, level): + def add_sub_title(self, string): """ - Set the level of publication ('monographic' vs. 'analytic') + Add a sub-title of the tei:titleStmt. """ - self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title[@type="main"]', namespaces=ns)[0].set("level", level) + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", "sub") + node.text = string + titleStmt.append(copy.deepcopy(node)) - def add_sub_title(self, string): + def add_part_title(self, number, string): """ - Add a sub title to the title statements. + Add a part title of the tei:titleStmt. 
""" - sub_title = etree.Element("%stitle" % TEI) - sub_title.set("type", "sub") - sub_title.text = string - for title_stmt in self.tree.xpath('//tei:titleStmt', namespaces=ns): - title_stmt.append(copy.deepcopy(sub_title)) + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", "part") + node.set("n", number) + node.text = string + titleStmt.append(copy.deepcopy(node)) + + def add_volume_title(self, number, typ, string): + """ + Add a volume title of the tei:titleStmt. + """ + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + node = etree.Element("%stitle" % TEI) + node.set("type", typ) + node.set("n", number) + node.text = string + titleStmt.append(copy.deepcopy(node)) + + def init_biblFull(self): + """ + Set the main, sub, and part/volume titles of the tei:biblFull by copying from tei:titleStmt. + """ + titleStmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + bibl = self.tree.xpath('//tei:sourceDesc/tei:biblFull', namespaces=ns)[0] + bibl.append(copy.deepcopy(titleStmt)) + + def set_publication_level(self, level): + """ + Set the level of publication: + - 'm': (monographic) the title applies to a monograph such as a book + or other item considered to be a distinct publication, + including single volumes of multi-volume works + - 'a': (analytic) the title applies to an analytic item, such as an article, + poem, or other work published as part of a larger item. 
+ - 'j': (journal) the title applies to any serial or periodical publication + such as a journal, magazine, or newspaper + - 's': (series) the title applies to a series of otherwise distinct publications + such as a collection + - 'u': (unpublished) the title applies to any unpublished material + (including theses and dissertations unless published by a commercial press) + """ + assert level in ['m', 'a', 'j', 's', 'u'] + for title in self.tree.xpath('//tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:title', namespaces=ns): + title.set("level", level) def add_author(self, person, typ): """ @@ -366,6 +458,19 @@ def add_author(self, person, typ): for title_stmt in self.tree.xpath('//tei:titleStmt', namespaces=ns): title_stmt.append(copy.deepcopy(author)) + def add_note(self, note): + """ + Add a note with details about the document. + """ + fileDesc = self.tree.xpath('//tei:fileDesc', namespaces=ns)[0] + if not fileDesc.xpath('/tei:notesStmt', namespaces=ns): + notes = etree.SubElement(fileDesc, "%snotesStmt" % TEI) + else: + notes = fileDesc.xpath('/tei:notesStmt', namespaces=ns)[0] + node = etree.SubElement(notes, "%snote" % TEI) + node.text = note + node.set("type", "remarkDocument") + def add_place(self, place): """ Add a publication place to the publication statement. @@ -413,7 +518,7 @@ def add_digital_edition(self, digital_edition): """ Add an edition statement with details on the digital edition. 
""" - title_stmt = self.tree.xpath('//tei:titleStmt', namespaces=ns)[0] + title_stmt = self.tree.xpath('//tei:fileDesc', namespaces=ns)[0] edition_stmt = etree.SubElement(title_stmt, "%seditionStmt" % TEI) edition = etree.SubElement(edition_stmt, "%sedition" % TEI) edition.text = digital_edition @@ -462,50 +567,35 @@ def add_encoding_date(self, date): publication_stmt = self.tree.xpath('//tei:publicationStmt', namespaces=ns)[0] encoding_date = etree.SubElement(publication_stmt, "%sdate" % TEI) encoding_date.set("type", "publication") - encoding_date.text = date + if date: + encoding_date.text = date def set_encoding_description(self, creator): """ Set some details on the encoding of the digital edition """ encoding_desc = self.tree.xpath('//tei:encodingDesc', namespaces=ns)[0] - encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI) - encoding_desc_details.text = "Encoded with the help of %s." % creator + if creator: + encoding_desc_details = etree.SubElement(encoding_desc, "%sp" % TEI) + encoding_desc_details.text = "Encoded with the help of %s." 
% creator - def add_repository(self, repository): + def add_repository(self, name): """ Add the repository of the (original) manuscript """ ms_ident = self.tree.xpath('//tei:msDesc/tei:msIdentifier', namespaces=ns)[0] - repository_node = etree.SubElement(ms_ident, "%srepository" % TEI) - repository_node.text = repository - - def add_shelfmark(self, shelfmark): - """ - Add the shelf mark of the (original) manuscript - """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "shelfmark") - idno.text = shelfmark - - def add_urn(self, urn): - """ - Add the URN of the digital edition - """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "URN") - idno.text = urn + repository = etree.SubElement(ms_ident, "%srepository" % TEI) + repository.text = name - def add_vd_id(self, vd_id): + def add_identifier(self, type_, value): """ - Add the VD ID of the digital edition + Add the URN, PURL, VD ID, shelfmark etc. of the digital edition """ - ms_ident_idno = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] - idno = etree.SubElement(ms_ident_idno, "%sidno" % TEI) - idno.set("type", "VD") - idno.text = vd_id + ms_ident = self.tree.xpath('//tei:msDesc/tei:msIdentifier/tei:idno', namespaces=ns)[0] + # FIXME: URN, DTAID, ... should go to /tei:fileDesc/tei:publicationStmt/tei:idno instead + idno = etree.SubElement(ms_ident, "%sidno" % TEI) + idno.set("type", type_) + idno.text = value def set_type_desc(self, description): """ @@ -517,6 +607,36 @@ def set_type_desc(self, description): par = etree.SubElement(type_desc, "%sp" % TEI) par.text = line + def add_classcode(self, scheme, code): + """ + Add a document classification code. 
+ """ + profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] + if not profile_desc.xpath('/tei:textClass', namespaces=ns): + textclass = etree.SubElement(profile_desc, "%stextClass" % TEI) + else: + textclass = profile_desc.xpath('/tei:textClass', namespaces=ns)[0] + classcode = etree.SubElement(textclass, "%sclassCode" % TEI) + classcode.set("scheme", scheme) + classcode.text = code + + def add_keywords(self, scheme, terms): + """ + Add a document classification list of terms. + """ + profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] + if not profile_desc.xpath('/tei:textClass', namespaces=ns): + textclass = etree.SubElement(profile_desc, "%stextClass" % TEI) + else: + textclass = profile_desc.xpath('/tei:textClass', namespaces=ns)[0] + keywords = etree.SubElement(textclass, "%skeywords" % TEI) + keywords.set("scheme", scheme) + for type_, term in terms: + node = etree.SubElement(keywords, "%sterm" % TEI) + node.text = term + if type_: + node.set("type", type_) + def add_language(self, language): """ Add a language of the source document @@ -543,20 +663,20 @@ def add_collection(self, collection): """ Add a (free-text) collection of the digital document """ - profile_desc = self.tree.xpath('//tei:profileDesc', namespaces=ns)[0] - creation = etree.SubElement(profile_desc, "%screation" % TEI) - creation.text = collection + profile_desc = self.tree.xpath('//tei:msDesc/tei:msIdentifier', namespaces=ns)[0] + coll = etree.SubElement(profile_desc, "%scollection" % TEI) + coll.text = collection - def compile_bibl(self): + def compile_bibl(self, type_): """ Compile the content of the short citation element 'bibl' based on the current state """ - if self.publication_level: - self.bibl.set("type", self.publication_level) + if type_: + self.bibl.set("type", type_) bibl_text = "" if self.authors: bibl_text += "; ".join(self.authors) + ": " - elif self.publication_level == "monograph": + elif type_ and type_.startswith("M"): bibl_text = "[N. 
N.], " bibl_text += self.main_title + "." if self.places: @@ -591,10 +711,15 @@ def __add_ocr_to_node(self, node, mets): """ Add text to a given node and recursively add text to children too (post order!) """ - + + node_id = node.get("id") + self.logger.debug("Adding text for %s", node_id) for childnode in node.iterchildren(): self.__add_ocr_to_node(childnode, mets) - struct_links = mets.get_struct_links(node.get("id")) + struct_links = mets.get_struct_links(node_id) + if not struct_links and node_id in mets.page_map: + # already physical + struct_links = [node_id] # a header will always be on the first page of a div first = True @@ -623,7 +748,14 @@ def __add_ocr_to_node(self, node, mets): self.alto_map[alto_link] = alto pb = etree.SubElement(node, "%spb" % TEI) - pb.set("facs", "#f{:04d}".format(int(mets.get_order(struct_link)))) + try: + pagenum = list(mets.page_map.keys()).index(struct_link) + pb.set("facs", "#f{:04d}".format(pagenum + 1)) + except ValueError: + self.logger.warning("cannot determine image number for '%s'", struct_link) + pagenum = mets.get_orderlabel(struct_link) or mets.get_order(struct_link) + if pagenum: + pb.set("n", str(pagenum)) pb.set("corresp", mets.get_img(struct_link)) for text_block in alto.get_text_blocks(): @@ -676,26 +808,34 @@ def __add_ocr_to_node(self, node, mets): node.insert(0, par) first = False - def add_div_structure(self, div): + def add_div_structure(self, div, pages=None): """ - Add div elements to the text body according to the given list of divs + Add logical div elements to the text font/body/back according to the given div hierarchy """ # div structure has to be added to text text = self.tree.xpath('//tei:text', namespaces=ns)[0] + front = etree.SubElement(text, "%sfront" % TEI) + body = etree.SubElement(text, "%sbody" % TEI) + back = etree.SubElement(text, "%sback" % TEI) + + if pages: + for page in pages: + self.logger.debug("Found physical page %s", page.get_ID()) + self.__add_div(body, page, 1) + return - # 
decent to the deepest AMD + # descend to the deepest AMD while div.get_ADMID() is None: + self.logger.debug("Found logical outer div type %s: %s", div.get_TYPE(), div.get_ID()) div = div.get_div()[0] start_div = div.get_div()[0] + self.logger.debug("Found logical inner div type %s: %s", start_div.get_TYPE(), start_div.get_ID()) while start_div.get_div() and start_div.get_div()[0].get_ADMID() is not None: + self.logger.debug("Found logical inner div type %s: %s", start_div.get_TYPE(), start_div.get_ID()) div = start_div start_div = start_div.get_div()[0] - front = etree.SubElement(text, "%sfront" % TEI) - body = etree.SubElement(text, "%sbody" % TEI) - back = etree.SubElement(text, "%sback" % TEI) - entry_point = front for sub_div in div.get_div(): @@ -704,8 +844,48 @@ def add_div_structure(self, div): elif sub_div.get_TYPE() == "title_page": self.__add_div(entry_point, sub_div, 1, "titlePage") else: + # FIXME: if title_page gets preceded by figure/preface/contents/..., they *all* will end up in body entry_point = body self.__add_div(entry_point, sub_div, 1) + # FIXME: add more structural mappings from METS-Anwendungsprofil (DFG Strukturdatenset) to TEI-P5 tagset (DTAbf) + # ...for example: + # contents → contents + # corrigenda → corrigenda + # dedication → dedication + # index → index + # imprint → imprint + # ? → imprimatur + # priviledges? → copyright + # provenance → ? + # ? → appendix + # ? → advertisement + # preface → preface + # ? → postface + # chapter → chapter + # letter → letter + # verse → poem + # ? → diaryEntry + # ? → recipe + # ? → scene + # ? → act + # ? → frontispiece + # ? → bibliography + # list_illustrations? → figures + # ? → abbreviations + # ? → edition + # cover → ? + # cover_front → ? + # cover_back → ? + # table → ? + # manuscript → ? + # illustration → ? + # section → ? + # article → ? + # issue → ? + # day → ? + # month → ? + # volume → ? + # year → ? 
def __add_div(self, insert_node, div, n, tag="div"): """ @@ -718,6 +898,9 @@ def __add_div(self, insert_node, div, n, tag="div"): #head = etree.SubElement(new_div, "%s%s" % (TEI, "head")) #head.text = div.get_LABEL() new_div.set("rend", div.get_LABEL()) + self.logger.debug("Adding %s[@id=%s,@n=%d,@rend=%s] for %s", + tag, div.get_ID(), n, div.get_LABEL() or "", + insert_node.tag.split('}')[-1]) for sub_div in div.get_div(): self.__add_div(new_div, sub_div, n+1) diff --git a/mets_mods2tei/data/tei_skeleton.xml b/mets_mods2tei/data/tei_skeleton.xml index 1e04da9..142e491 100644 --- a/mets_mods2tei/data/tei_skeleton.xml +++ b/mets_mods2tei/data/tei_skeleton.xml @@ -8,13 +8,12 @@