From d0ab900bb21622fdc9bfde58f9388c16455fe257 Mon Sep 17 00:00:00 2001 From: Sergio Vera Date: Thu, 8 Aug 2024 17:10:40 +0100 Subject: [PATCH] Use epub lib method for metadata --- internal/index/bleve_write.go | 2 +- internal/metadata/epub.go | 122 ++++++++++++++++------------------ internal/metadata/metadata.go | 1 - 3 files changed, 57 insertions(+), 68 deletions(-) diff --git a/internal/index/bleve_write.go b/internal/index/bleve_write.go index 5100eeaa..1cee28eb 100644 --- a/internal/index/bleve_write.go +++ b/internal/index/bleve_write.go @@ -135,7 +135,7 @@ func (b *BleveIndexer) createDocument(meta metadata.Metadata, fullPath string, b // processed in the current batch in memory to also compare the current doc slug against them. func (b *BleveIndexer) Slug(document DocumentWrite, batchSlugs map[string]struct{}) string { docSlug := makeSlug(document) - exp, err := regexp.Compile(`^[a-zA-Z0-9\-]+(--)\d$`) + exp, err := regexp.Compile(`^[a-zA-Z0-9\-]+(--)[0-9]+$`) if err != nil { log.Fatal(err) } diff --git a/internal/metadata/epub.go b/internal/metadata/epub.go index 562b85cf..ace34881 100644 --- a/internal/metadata/epub.go +++ b/internal/metadata/epub.go @@ -22,26 +22,24 @@ type EpubReader struct{} func (e EpubReader) Metadata(file string) (Metadata, error) { bk := Metadata{} - opf, err := epub.GetPackageFromFile(file) + meta, err := epub.GetMetadataFromFile(file) if err != nil { return bk, err } title := strings.TrimSuffix(filepath.Base(file), filepath.Ext(file)) - if len(opf.Metadata.Title) > 0 && len(opf.Metadata.Title[0].Value) > 0 { - title = opf.Metadata.Title[0].Value + if len(meta.Title) > 0 && len(meta.Title[0]) > 0 { + title = meta.Title[0] } var authors []string - if len(opf.Metadata.Creator) > 0 { - for _, creator := range opf.Metadata.Creator { - if creator.Role == "aut" || creator.Role == "" { - // Some epub files mistakenly put all authors in a single field instead of using a field for each one. - // We want to identify those cases looking for specific separators and then indexing each author properly. - names := strings.Split(creator.Value, "&") - for i := range names { - names[i] = strings.TrimSpace(names[i]) - } - authors = append(authors, names...) + for _, creator := range meta.Creator { + if creator.Role == "aut" || creator.Role == "" { + // Some epub files mistakenly put all authors in a single field instead of using a field for each one. + // We want to identify those cases looking for specific separators and then indexing each author properly. + names := strings.Split(creator.FullName, "&") + for i := range names { + names[i] = strings.TrimSpace(names[i]) } + authors = append(authors, names...) } } @@ -50,72 +48,51 @@ func (e EpubReader) Metadata(file string) (Metadata, error) { } var subjects []string - if len(opf.Metadata.Subject) > 0 { - for _, subject := range opf.Metadata.Subject { - subject.Value = strings.TrimSpace(subject.Value) - if subject.Value == "" { - continue - } - // Some epub files mistakenly put all subjects in a single field instead of using a field for each one. - // We want to identify those cases looking for specific separators and then indexing each subject properly. - names := strings.Split(subject.Value, ",") - for i := range names { - names[i] = strings.TrimSpace(names[i]) - } - subjects = append(subjects, names...) + for _, subject := range meta.Subject { + subject = strings.TrimSpace(subject) + if subject == "" { + continue + } + // Some epub files mistakenly put all subjects in a single field instead of using a field for each one. + // We want to identify those cases looking for specific separators and then indexing each subject properly. + names := strings.Split(subject, ",") + for i := range names { + names[i] = strings.TrimSpace(names[i]) } + subjects = append(subjects, names...) } description := "" - if len(opf.Metadata.Description) > 0 { + if len(meta.Description) > 0 { strict := bluemonday.StrictPolicy() - noHTMLDescription := strict.Sanitize(opf.Metadata.Description[0].Value) - if noHTMLDescription == opf.Metadata.Description[0].Value { - paragraphs := strings.Split(opf.Metadata.Description[0].Value, "\n") + noHTMLDescription := strict.Sanitize(meta.Description[0]) + if noHTMLDescription == meta.Description[0] { + paragraphs := strings.Split(meta.Description[0], "\n") description = "

" + strings.Join(paragraphs, "

") + "

" } else { p := bluemonday.UGCPolicy() - description = p.Sanitize(opf.Metadata.Description[0].Value) + description = p.Sanitize(meta.Description[0]) } } lang := "" - if len(opf.Metadata.Language) > 0 { - lang = opf.Metadata.Language[0].Value + if len(meta.Language) > 0 { + lang = meta.Language[0] } year := "" - if len(opf.Metadata.Date) > 0 { - for _, date := range opf.Metadata.Date { - if date.Event == "publication" || date.Event == "" { - t, err := time.Parse("2006-01-02", date.Value) - if err == nil { - year = strings.TrimLeft(t.Format("2006"), "0") - break - } + for _, date := range meta.Date { + if date.Event == "publication" || date.Event == "" { + t, err := time.Parse("2006-01-02", date.Stamp) + if err == nil { + year = strings.TrimLeft(t.Format("2006"), "0") + break } } } - cover := "" - series := "" var seriesIndex float64 = 0 - for _, val := range opf.Metadata.Meta { - if val.Name == "cover" { - id := val.Content - for _, item := range opf.Manifest.Items { - if item.ID == id { - cover = item.Href - break - } - } - } - if val.Name == "calibre:series" { - series = val.Content - } - if val.Name == "calibre:series_index" { - seriesIndex, _ = strconv.ParseFloat(val.Content, 64) - } - } + + seriesIndex, _ = strconv.ParseFloat(meta.SeriesIndex, 64) bk = Metadata{ Title: title, @@ -123,8 +100,7 @@ func (e EpubReader) Metadata(file string) (Metadata, error) { Description: template.HTML(description), Language: lang, Year: year, - Cover: cover, - Series: series, + Series: meta.Series, SeriesIndex: seriesIndex, Type: "EPUB", Subjects: subjects, @@ -141,12 +117,26 @@ func (e EpubReader) Metadata(file string) (Metadata, error) { func (e EpubReader) Cover(documentFullPath string, coverMaxWidth int) ([]byte, error) { var cover []byte - reader := EpubReader{} - meta, err := reader.Metadata(documentFullPath) + coverFileName := "" + + opf, err := epub.GetPackageFromFile(documentFullPath) if err != nil { return nil, err } - if meta.Cover == "" { + + for _, val := range opf.Metadata.Meta { + if val.Name != "cover" { + continue + } + for _, item := range opf.Manifest.Items { + if item.ID == val.Content { + coverFileName = item.Href + break + } + } + } + + if coverFileName == "" { return nil, fmt.Errorf("no cover image set in %s", documentFullPath) } @@ -155,7 +145,7 @@ func (e EpubReader) Cover(documentFullPath string, coverMaxWidth int) ([]byte, e return nil, err } defer r.Close() - cover, err = extractCover(r, meta.Cover, coverMaxWidth) + cover, err = extractCover(r, coverFileName, coverMaxWidth) if err != nil { return nil, err } diff --git a/internal/metadata/metadata.go b/internal/metadata/metadata.go index 55a4f865..aac25193 100644 --- a/internal/metadata/metadata.go +++ b/internal/metadata/metadata.go @@ -13,7 +13,6 @@ type Metadata struct { Language string Year string Words float64 - Cover string Series string SeriesIndex float64 Pages int