Skip to content

Commit

Permalink
import: add ios, refs #24731
Browse files Browse the repository at this point in the history
  • Loading branch information
miku committed Jan 2, 2024
1 parent 1bde957 commit 73ced1f
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 5 deletions.
33 changes: 28 additions & 5 deletions cmd/span-import/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/miku/span/formats/highwire"
"github.com/miku/span/formats/ieee"
"github.com/miku/span/formats/imslp"
"github.com/miku/span/formats/ios"
"github.com/miku/span/formats/jstor"
"github.com/miku/span/formats/mediarep"
"github.com/miku/span/formats/olms"
Expand Down Expand Up @@ -73,6 +74,7 @@ var FormatMap = map[string]Factory{
"hhbd": func() interface{} { return new(hhbd.Record) },
"highwire": func() interface{} { return new(highwire.Record) },
"ieee": func() interface{} { return new(ieee.Publication) },
"ios": func() interface{} { return new(ios.Article) },
"imslp": func() interface{} { return new(imslp.Data) },
"jstor": func() interface{} { return new(jstor.Article) },
"mediarep-dim": func() interface{} { return new(mediarep.Dim) },
Expand Down Expand Up @@ -251,14 +253,35 @@ func main() {

switch *name {
// XXX: Configure this in one place.
case "highwire", "ceeol", "ieee", "genios", "jstor", "thieme-tm",
"zvdd", "degruyter", "zvdd-mets", "hhbd", "thieme-nlm", "olms",
"olms-mets", "ssoar", "genderopen", "mediarep-dim",
"ceeol-marcxml", "doaj-oai", "dblp":
case
"ceeol",
"ceeol-marcxml",
"dblp",
"degruyter",
"doaj-oai",
"genderopen",
"genios",
"hhbd",
"highwire",
"ieee",
"ios",
"jstor",
"mediarep-dim",
"olms",
"olms-mets",
"ssoar",
"thieme-nlm",
"thieme-tm",
"zvdd",
"zvdd-mets":
if err := processXML(reader, w, *name); err != nil {
log.Fatal(err)
}
case "doaj", "doaj-api", "crossref", "dummy":
case
"crossref",
"doaj",
"doaj-api",
"dummy":
if err := processJSON(reader, w, *name); err != nil {
log.Fatal(err)
}
Expand Down
1 change: 1 addition & 0 deletions fixtures/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ios/*
149 changes: 149 additions & 0 deletions formats/ios/article.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package ios

// Copyright 2023 by Leipzig University Library, http://ub.uni-leipzig.de
// The Finc Authors, http://finc.info
// Martin Czygan, <martin.czygan@uni-leipzig.de>
//
// This file is part of some open source application.
//
// Some open source application is free software: you can redistribute
// it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either
// version 3 of the License, or (at your option) any later version.
//
// Some open source application is distributed in the hope that it will
// be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Foobar. If not, see <http://www.gnu.org/licenses/>.
//
// @license GPL-3.0+ <http://spdx.org/licenses/GPL-3.0+>

import (
"encoding/base64"
"encoding/xml"
"fmt"
"regexp"
"strings"

"github.com/miku/span"
"github.com/miku/span/container"
"github.com/miku/span/formats/finc"
"github.com/miku/span/formats/jats"
"golang.org/x/text/language"
)

const (
// SourceID for internal bookkeeping, refs #24731
SourceID = "219"
// SourceName for finc.mega_collection.
SourceName = "IOS Press"
// Format for intermediate schema.
Format = "ElectronicArticle"
)

var (
// ArticleTitleBlockPatterns
ArticleTitleBlockPatterns = []*regexp.Regexp{
regexp.MustCompile(`(?i)(front|back)\s*matter`),
regexp.MustCompile(`(?i)table\s*of\s*content[s]?`),
}
DOIPattern = regexp.MustCompile(`10\.[0-9]+\/\S+`)
)

type Article struct {
XMLName xml.Name `xml:"article"`
jats.Article
}

// Identifiers returns the doi and the dependent url and recordID in a struct.
// Records from this source do not need a DOI necessarily.
func (article *Article) Identifiers() (jats.Identifiers, error) {
locator := article.Front.Article.SelfURI.Value

doi := DOIPattern.FindString(locator)
id := fmt.Sprintf("ai-%s-%s", SourceID, base64.RawURLEncoding.EncodeToString([]byte(locator)))
return jats.Identifiers{DOI: doi, URL: locator, ID: id}, nil
}

// Authors returns the authors as slice.
func (article *Article) Authors() []finc.Author {
var authors []finc.Author
group := article.Front.Article.ContribGroup
for _, contrib := range group.Contrib {
authors = append(authors, finc.Author{
LastName: contrib.StringName.Surname.Value,
FirstName: contrib.StringName.GivenNames.Value})
}
return authors
}

// Languages returns a list of language in 3-letter format.
func (article *Article) Languages() []string {
set := container.NewStringSet()
for _, cm := range article.Front.Article.CustomMetaGroup.CustomMeta {
if cm.Name.Value == "lang" {
base, err := language.ParseBase(cm.Value.Value)
if err == nil {
set.Add(base.ISO3())
}
}
}
return set.Values()
}

// ToIntermediateSchema converts an article into an internal schema. There are a
// couple of content-dependent choices here.
func (article *Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
output, err := article.Article.ToIntermediateSchema()
if err != nil {
return output, err
}
ids, err := article.Identifiers()
if err != nil {
return output, err
}
output.DOI = ids.DOI
id := ids.ID
if len(id) > span.KeyLengthLimit {
return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
}
if len(id) == 0 {
return nil, span.Skip{Reason: fmt.Sprintf("no doi or identifier: %v", article)}
}
output.ID = id
output.RecordID = ids.DOI
output.URL = append(output.URL, ids.URL)
output.Authors = article.Authors()
output.Format = Format
output.Languages = article.Languages()
output.MegaCollections = []string{SourceName}
output.SourceID = SourceID
var normalized []string
for _, issn := range output.ISSN {
if len(issn) == 8 && !strings.Contains(issn, "-") {
normalized = append(normalized, fmt.Sprintf("%s-%s", issn[:4], issn[4:]))
}
}
output.ISSN = normalized
// refs #5686
if output.Date.IsZero() {
return output, span.Skip{Reason: fmt.Sprintf("zero date: %s", output.ID)}
}
// refs #5686
for _, p := range ArticleTitleBlockPatterns {
if p.MatchString(output.ArticleTitle) {
return output, span.Skip{Reason: fmt.Sprintf("title blacklisted: %s", output.ArticleTitle)}
}
}
// refs #5686, approx. article type distribution: https://git.io/vzlCr
// switch article.Type {
// case "book-review", "book-reviews", "Book Review":
// output.ArticleTitle = fmt.Sprintf("Review: %s", article.ReviewedProduct())
// case "misc", "other", "front-matter", "back-matter", "announcement", "font-matter", "fm", "fornt-matter":
// return output, span.Skip{Reason: fmt.Sprintf("suppressed format: %s", article.Type)}
// }
return output, nil
}

0 comments on commit 73ced1f

Please sign in to comment.