From 60e37bef1a7ea4a5970f2eaf574170a56d45cb82 Mon Sep 17 00:00:00 2001 From: Sergio Vera Date: Sun, 18 Feb 2024 11:34:22 +0100 Subject: [PATCH] Improved search results by taking into account indexed languages --- internal/index/bleve.go | 2 +- internal/index/bleve_read.go | 89 +++++++++++++------ internal/index/bleve_write.go | 24 +++++ .../webserver/controller/document/detail.go | 3 +- 4 files changed, 86 insertions(+), 32 deletions(-) diff --git a/internal/index/bleve.go b/internal/index/bleve.go index f7d5684..fce5595 100644 --- a/internal/index/bleve.go +++ b/internal/index/bleve.go @@ -23,7 +23,7 @@ import ( // Version identifies the mapping used for indexing. Any changes in the mapping requires an increase // of version, to signal that a new index needs to be created. -const Version = "v1" +const Version = "v2" var noStopWordsFilters = map[string][]string{ es.AnalyzerName: {es.NormalizeName, lowercase.Name, es.LightStemmerName}, diff --git a/internal/index/bleve_read.go b/internal/index/bleve_read.go index f8a77de..eda8ad9 100644 --- a/internal/index/bleve_read.go +++ b/internal/index/bleve_read.go @@ -43,33 +43,42 @@ func (b *BleveIndexer) Search(keywords string, page, resultsPerPage int) (result return b.runPaginatedQuery(qb, page, resultsPerPage) } - compound := composeQuery(keywords) + analyzers, err := b.analyzers() + if err != nil { + return result.Paginated[[]Document]{}, err + } + compound := composeQuery(keywords, analyzers) return b.runPaginatedQuery(compound, page, resultsPerPage) } -func composeQuery(keywords string) *query.DisjunctionQuery { +func composeQuery(keywords string, analyzers []string) *query.DisjunctionQuery { langCompoundQuery := bleve.NewDisjunctionQuery() - for lang := range noStopWordsFilters { + for _, analyzer := range analyzers { + noStopWordsAnalyzer := analyzer + if analyzer != defaultAnalyzer { + noStopWordsAnalyzer = analyzer + "_no_stop_words" + } + qt := bleve.NewMatchPhraseQuery(keywords) - qt.Analyzer = lang + "_no_stop_words" + qt.Analyzer = noStopWordsAnalyzer qt.SetField("Title") langCompoundQuery.AddQuery(qt) qs := bleve.NewMatchQuery(keywords) - qs.Analyzer = lang + "_no_stop_words" + qs.Analyzer = noStopWordsAnalyzer qs.SetField("Series") qs.Operator = query.MatchQueryOperatorAnd langCompoundQuery.AddQuery(qs) qu := bleve.NewMatchQuery(keywords) - qu.Analyzer = lang + qu.Analyzer = analyzer qu.SetField("Subjects") qu.Operator = query.MatchQueryOperatorAnd langCompoundQuery.AddQuery(qu) qd := bleve.NewMatchQuery(keywords) - qd.Analyzer = lang + qd.Analyzer = analyzer qd.SetField("Description") qd.Operator = query.MatchQueryOperatorAnd langCompoundQuery.AddQuery(qd) @@ -211,28 +220,35 @@ func (b *BleveIndexer) Documents(IDs []string) (map[string]Document, error) { // SameSubjects returns an array of metadata of documents by other authors, different between each other, // which have similar subjects as the passed one and does not belong to the same collection -func (b *BleveIndexer) SameSubjects(slug string, quantity int) ([]Document, error) { - doc, err := b.Document(slug) +func (b *BleveIndexer) SameSubjects(slugID string, quantity int) ([]Document, error) { + doc, err := b.Document(slugID) if err != nil { return []Document{}, err } + bq := bleve.NewBooleanQuery() subjectsCompoundQuery := bleve.NewDisjunctionQuery() + for _, subject := range doc.Subjects { - qu := bleve.NewMatchPhraseQuery(subject) - qu.SetField("Subjects") + subject = strings.ReplaceAll(slug.Make(subject), "-", "") + qu := bleve.NewTermQuery(subject) + qu.SetField("SubjectsEq") subjectsCompoundQuery.AddQuery(qu) } - bq := bleve.NewBooleanQuery() + + series := strings.ReplaceAll(slug.Make(doc.Series), "-", "") + sq := bleve.NewTermQuery(series) + sq.SetField("SeriesEq") + bq.AddMustNot(sq) + bq.AddMust(subjectsCompoundQuery) bq.AddMustNot(bleve.NewDocIDQuery([]string{doc.ID})) - sq := bleve.NewMatchPhraseQuery(doc.Series) - sq.SetField("Series") - bq.AddMustNot(sq) + authorsCompoundQuery := bleve.NewDisjunctionQuery() for _, author := range doc.Authors { - qa := bleve.NewMatchPhraseQuery(author) - qa.SetField("Authors") + author = strings.ReplaceAll(slug.Make(author), "-", "") + qa := bleve.NewTermQuery(author) + qa.SetField("AuthorsEq") authorsCompoundQuery.AddQuery(qa) } bq.AddMustNot(authorsCompoundQuery) @@ -248,8 +264,9 @@ func (b *BleveIndexer) SameSubjects(slug string, quantity int) ([]Document, erro } res = append(res, doc[0]) for _, author := range doc[0].Authors { - qa := bleve.NewMatchPhraseQuery(author) - qa.SetField("Authors") + author = strings.ReplaceAll(slug.Make(author), "-", "") + qa := bleve.NewTermQuery(author) + qa.SetField("AuthorsEq") authorsCompoundQuery.AddQuery(qa) } bq.AddMustNot(authorsCompoundQuery) @@ -260,44 +277,58 @@ func (b *BleveIndexer) SameSubjects(slug string, quantity int) ([]Document, erro // SameAuthors returns an array of metadata of documents by the same authors which // does not belong to the same collection -func (b *BleveIndexer) SameAuthors(slug string, quantity int) ([]Document, error) { - doc, err := b.Document(slug) +func (b *BleveIndexer) SameAuthors(slugID string, quantity int) ([]Document, error) { + doc, err := b.Document(slugID) if err != nil { return []Document{}, err } authorsCompoundQuery := bleve.NewDisjunctionQuery() for _, author := range doc.Authors { - qu := bleve.NewMatchPhraseQuery(author) - qu.SetField("Authors") + author = strings.ReplaceAll(slug.Make(author), "-", "") + qu := bleve.NewTermQuery(author) + qu.SetField("AuthorsEq") authorsCompoundQuery.AddQuery(qu) } bq := bleve.NewBooleanQuery() bq.AddMust(authorsCompoundQuery) bq.AddMustNot(bleve.NewDocIDQuery([]string{doc.ID})) - sq := bleve.NewMatchPhraseQuery(doc.Series) - sq.SetField("Series") + + series := strings.ReplaceAll(slug.Make(doc.Series), "-", "") + sq := bleve.NewTermQuery(series) + sq.SetField("SeriesEq") + bq.AddMustNot(sq) return b.runQuery(bq, quantity) } // SameSeries returns an array of metadata of documents in the same series -func (b *BleveIndexer) SameSeries(slug string, quantity int) ([]Document, error) { - doc, err := b.Document(slug) +func (b *BleveIndexer) SameSeries(slugID string, quantity int) ([]Document, error) { + doc, err := b.Document(slugID) if err != nil { return []Document{}, err } bq := bleve.NewBooleanQuery() bq.AddMustNot(bleve.NewDocIDQuery([]string{doc.ID})) - sq := bleve.NewMatchPhraseQuery(doc.Series) - sq.SetField("Series") + series := strings.ReplaceAll(slug.Make(doc.Series), "-", "") + + sq := bleve.NewMatchPhraseQuery(series) + sq.SetField("SeriesEq") bq.AddMust(sq) return b.runQuery(bq, quantity) } +func (b *BleveIndexer) analyzers() ([]string, error) { + languages, err := b.idx.GetInternal([]byte("languages")) + if err != nil { + return []string{}, err + } + return strings.Split(string(languages), ","), nil +} + func slicer(val interface{}) []string { var ( terms []interface{} diff --git a/internal/index/bleve_write.go b/internal/index/bleve_write.go index 7405860..1169849 100644 --- a/internal/index/bleve_write.go +++ b/internal/index/bleve_write.go @@ -5,6 +5,7 @@ import ( "log" "os" "path/filepath" + "slices" "strings" "github.com/gosimple/slug" @@ -46,6 +47,7 @@ func (b *BleveIndexer) RemoveFile(file string) error { func (b *BleveIndexer) AddLibrary(fs afero.Fs, batchSize int) error { batch := b.idx.NewBatch() batchSlugs := make(map[string]struct{}, batchSize) + languages := []string{} e := afero.Walk(fs, b.libraryPath, func(fullPath string, f os.FileInfo, err error) error { ext := strings.ToLower(filepath.Ext(fullPath)) if _, ok := b.reader[ext]; !ok { @@ -59,6 +61,7 @@ func (b *BleveIndexer) AddLibrary(fs afero.Fs, batchSize int) error { document := b.createDocument(meta, fullPath, batchSlugs) batchSlugs[document.Slug] = struct{}{} + languages = addLanguage(meta.Language, languages) err = batch.Index(document.ID, document) if err != nil { @@ -74,10 +77,31 @@ func (b *BleveIndexer) AddLibrary(fs afero.Fs, batchSize int) error { return nil }) + b.idx.SetInternal([]byte("languages"), []byte(strings.Join(languages, ","))) b.idx.Batch(batch) return e } +func addLanguage(lang string, languages []string) []string { + if !slices.Contains(languages, defaultAnalyzer) && lang == "" { + return append(languages, defaultAnalyzer) + } + + if _, ok := noStopWordsFilters[lang]; ok { + found := false + for i := range languages { + if languages[i] == lang { + found = true + break + } + } + if !found { + languages = append(languages, lang) + } + } + return languages +} + func (b *BleveIndexer) createDocument(meta metadata.Metadata, fullPath string, batchSlugs map[string]struct{}) DocumentWrite { document := DocumentWrite{ Document: Document{ diff --git a/internal/webserver/controller/document/detail.go b/internal/webserver/controller/document/detail.go index c340823..46f89f5 100644 --- a/internal/webserver/controller/document/detail.go +++ b/internal/webserver/controller/document/detail.go @@ -24,8 +24,7 @@ func (d *Controller) Detail(c *fiber.Ctx) error { document, err := d.idx.Document(c.Params("slug")) if err != nil { - fmt.Println(err) - return fiber.ErrBadRequest + return fiber.ErrNotFound } if _, err := os.Stat(filepath.Join(d.config.LibraryPath, document.ID)); err != nil {