From 73ced1f3a73faf7b33e71f5fc7acccf3835f252e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 2 Jan 2024 14:08:48 +0100 Subject: [PATCH] import: add ios, refs #24731 --- cmd/span-import/main.go | 33 +++++++-- fixtures/.gitignore | 1 + formats/ios/article.go | 149 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 5 deletions(-) create mode 100644 fixtures/.gitignore create mode 100644 formats/ios/article.go diff --git a/cmd/span-import/main.go b/cmd/span-import/main.go index 7b6e2642..a880c4de 100644 --- a/cmd/span-import/main.go +++ b/cmd/span-import/main.go @@ -30,6 +30,7 @@ import ( "github.com/miku/span/formats/highwire" "github.com/miku/span/formats/ieee" "github.com/miku/span/formats/imslp" + "github.com/miku/span/formats/ios" "github.com/miku/span/formats/jstor" "github.com/miku/span/formats/mediarep" "github.com/miku/span/formats/olms" @@ -73,6 +74,7 @@ var FormatMap = map[string]Factory{ "hhbd": func() interface{} { return new(hhbd.Record) }, "highwire": func() interface{} { return new(highwire.Record) }, "ieee": func() interface{} { return new(ieee.Publication) }, + "ios": func() interface{} { return new(ios.Article) }, "imslp": func() interface{} { return new(imslp.Data) }, "jstor": func() interface{} { return new(jstor.Article) }, "mediarep-dim": func() interface{} { return new(mediarep.Dim) }, @@ -251,14 +253,35 @@ func main() { switch *name { // XXX: Configure this in one place. 
- case "highwire", "ceeol", "ieee", "genios", "jstor", "thieme-tm", - "zvdd", "degruyter", "zvdd-mets", "hhbd", "thieme-nlm", "olms", - "olms-mets", "ssoar", "genderopen", "mediarep-dim", - "ceeol-marcxml", "doaj-oai", "dblp": + case + "ceeol", + "ceeol-marcxml", + "dblp", + "degruyter", + "doaj-oai", + "genderopen", + "genios", + "hhbd", + "highwire", + "ieee", + "ios", + "jstor", + "mediarep-dim", + "olms", + "olms-mets", + "ssoar", + "thieme-nlm", + "thieme-tm", + "zvdd", + "zvdd-mets": if err := processXML(reader, w, *name); err != nil { log.Fatal(err) } - case "doaj", "doaj-api", "crossref", "dummy": + case + "crossref", + "doaj", + "doaj-api", + "dummy": if err := processJSON(reader, w, *name); err != nil { log.Fatal(err) } diff --git a/fixtures/.gitignore b/fixtures/.gitignore new file mode 100644 index 00000000..dca7207d --- /dev/null +++ b/fixtures/.gitignore @@ -0,0 +1 @@ +ios/* diff --git a/formats/ios/article.go b/formats/ios/article.go new file mode 100644 index 00000000..f5392238 --- /dev/null +++ b/formats/ios/article.go @@ -0,0 +1,149 @@ +package ios + +// Copyright 2023 by Leipzig University Library, http://ub.uni-leipzig.de +// The Finc Authors, http://finc.info +// Martin Czygan, +// +// This file is part of some open source application. +// +// Some open source application is free software: you can redistribute +// it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either +// version 3 of the License, or (at your option) any later version. +// +// Some open source application is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Foobar. If not, see . 
+//
+// @license GPL-3.0+
+
+import (
+	"encoding/base64"
+	"encoding/xml"
+	"fmt"
+	"regexp"
+	"strings"
+
+	"github.com/miku/span"
+	"github.com/miku/span/container"
+	"github.com/miku/span/formats/finc"
+	"github.com/miku/span/formats/jats"
+	"golang.org/x/text/language"
+)
+
+const (
+	// SourceID for internal bookkeeping, refs #24731
+	SourceID = "219"
+	// SourceName for finc.mega_collection.
+	SourceName = "IOS Press"
+	// Format for intermediate schema.
+	Format = "ElectronicArticle"
+)
+
+var (
+	// ArticleTitleBlockPatterns lists case-insensitive patterns for article
+	// titles (front/back matter, table of contents); records whose title
+	// matches any of them are skipped during conversion.
+	ArticleTitleBlockPatterns = []*regexp.Regexp{
+		regexp.MustCompile(`(?i)(front|back)\s*matter`),
+		regexp.MustCompile(`(?i)table\s*of\s*content[s]?`),
+	}
+	// DOIPattern matches a DOI-like substring: "10.", digits, a slash and a
+	// run of non-whitespace characters.
+	DOIPattern = regexp.MustCompile(`10\.[0-9]+\/\S+`)
+)
+
+// Article is an IOS Press record. It embeds the generic jats.Article and is
+// bound to the "article" XML element.
+type Article struct {
+	XMLName xml.Name `xml:"article"`
+	jats.Article
+}
+
+// Identifiers returns the doi and the dependent url and recordID in a struct.
+// Records from this source do not need a DOI necessarily.
+//
+// All values are derived from the article's self-uri: the URL is the self-uri
+// itself, the DOI is the first DOIPattern match within it (possibly empty) and
+// the ID is "ai-<SourceID>-<unpadded URL-safe base64 of the self-uri>".
+//
+// If the self-uri is blank, the zero jats.Identifiers is returned. Building
+// an ID from an empty locator would yield the same constant "ai-219-" for
+// every such record; an empty ID lets callers detect and skip them instead.
+// The returned error is always nil.
+func (article *Article) Identifiers() (jats.Identifiers, error) {
+	locator := article.Front.Article.SelfURI.Value
+	if strings.TrimSpace(locator) == "" {
+		return jats.Identifiers{}, nil
+	}
+	doi := DOIPattern.FindString(locator)
+	encoded := base64.RawURLEncoding.EncodeToString([]byte(locator))
+	id := fmt.Sprintf("ai-%s-%s", SourceID, encoded)
+	return jats.Identifiers{DOI: doi, URL: locator, ID: id}, nil
+}
+
+// Authors returns the authors as slice, one entry per contributor of the
+// article's contrib group, using the surname and given names of the
+// contributor's string name. The result is nil if there are no contributors.
+func (article *Article) Authors() []finc.Author {
+	var authors []finc.Author
+	for _, contrib := range article.Front.Article.ContribGroup.Contrib {
+		name := contrib.StringName
+		authors = append(authors, finc.Author{
+			LastName:  name.Surname.Value,
+			FirstName: name.GivenNames.Value,
+		})
+	}
+	return authors
+}
+
+// Languages returns a list of languages in 3-letter format.
+func (article *Article) Languages() []string {
+	set := container.NewStringSet()
+	for _, cm := range article.Front.Article.CustomMetaGroup.CustomMeta {
+		// Only custom-meta entries named "lang" carry a language tag.
+		if cm.Name.Value != "lang" {
+			continue
+		}
+		// Values that do not parse as a base language are ignored.
+		if base, err := language.ParseBase(cm.Value.Value); err == nil {
+			set.Add(base.ISO3())
+		}
+	}
+	return set.Values()
+}
+
+// ToIntermediateSchema converts an article into an internal schema. There are a
+// couple of content-dependent choices here.
+//
+// The generic JATS conversion is run first and then adjusted with IOS Press
+// specific identifiers, authors, languages, collection and source id. A
+// span.Skip error is returned for records that should not be indexed: the ID
+// exceeds span.KeyLengthLimit, there is no identifier or locator, the date is
+// zero, or the title matches one of ArticleTitleBlockPatterns.
+func (article *Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
+	output, err := article.Article.ToIntermediateSchema()
+	if err != nil {
+		return output, err
+	}
+	ids, err := article.Identifiers()
+	if err != nil {
+		return output, err
+	}
+	output.DOI = ids.DOI
+	id := ids.ID
+	if len(id) > span.KeyLengthLimit {
+		return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
+	}
+	// The ID is derived from the locator, so a blank locator means there is
+	// nothing unique to identify the record by; skip it rather than letting
+	// all such records share one ID.
+	if len(id) == 0 || strings.TrimSpace(ids.URL) == "" {
+		return nil, span.Skip{Reason: fmt.Sprintf("no doi or identifier: %v", article)}
+	}
+	output.ID = id
+	output.RecordID = ids.DOI
+	output.URL = append(output.URL, ids.URL)
+	output.Authors = article.Authors()
+	output.Format = Format
+	output.Languages = article.Languages()
+	output.MegaCollections = []string{SourceName}
+	output.SourceID = SourceID
+	// Bring ISSNs into the NNNN-NNNN form: hyphenate bare 8-character values
+	// and keep values that already are hyphenated; anything else is dropped.
+	var normalized []string
+	for _, raw := range output.ISSN {
+		issn := strings.TrimSpace(raw)
+		switch {
+		case len(issn) == 8 && !strings.Contains(issn, "-"):
+			normalized = append(normalized, fmt.Sprintf("%s-%s", issn[:4], issn[4:]))
+		case len(issn) == 9 && issn[4] == '-':
+			normalized = append(normalized, issn)
+		}
+	}
+	output.ISSN = normalized
+	// refs #5686
+	if output.Date.IsZero() {
+		return output, span.Skip{Reason: fmt.Sprintf("zero date: %s", output.ID)}
+	}
+	// refs #5686
+	for _, p := range ArticleTitleBlockPatterns {
+		if p.MatchString(output.ArticleTitle) {
+			return output, span.Skip{Reason: fmt.Sprintf("title blacklisted: %s", output.ArticleTitle)}
+		}
+	}
+	// refs #5686, approx. article type distribution: https://git.io/vzlCr
+	// switch article.Type {
+	// case "book-review", "book-reviews", "Book Review":
+	// 	output.ArticleTitle = fmt.Sprintf("Review: %s", article.ReviewedProduct())
+	// case "misc", "other", "front-matter", "back-matter", "announcement", "font-matter", "fm", "fornt-matter":
+	// 	return output, span.Skip{Reason: fmt.Sprintf("suppressed format: %s", article.Type)}
+	// }
+	return output, nil
+}