From ac27cb3378685ac5c3ef1aefd2db1f2d8d065419 Mon Sep 17 00:00:00 2001 From: Alfex4936 Date: Mon, 14 Oct 2024 19:09:19 +0900 Subject: [PATCH] Refactor: Optimize search result --- pom.xml | 10 +- .../config/lucene/HibernateSearchConfig.java | 40 ++++++- .../main/festival/model/Festival.java | 6 +- .../service/FestivalSearchService.java | 112 +++++++++++------- .../resources/lucene/english_synonyms.txt | 28 +++++ src/main/resources/lucene/korean_synonyms.txt | 30 +++++ 6 files changed, 171 insertions(+), 55 deletions(-) create mode 100644 src/main/resources/lucene/english_synonyms.txt create mode 100644 src/main/resources/lucene/korean_synonyms.txt diff --git a/pom.xml b/pom.xml index 0be044e..6c87176 100644 --- a/pom.xml +++ b/pom.xml @@ -212,6 +212,12 @@ 9.12.0 + + + org.apache.lucene + lucene-analysis-icu + 9.12.0 + @@ -312,11 +318,11 @@ ${project.build.directory}/jacoco - + true false true - + csw/korea/festival/main/common/annotation/** diff --git a/src/main/java/csw/korea/festival/main/config/lucene/HibernateSearchConfig.java b/src/main/java/csw/korea/festival/main/config/lucene/HibernateSearchConfig.java index 9bffde9..172879f 100644 --- a/src/main/java/csw/korea/festival/main/config/lucene/HibernateSearchConfig.java +++ b/src/main/java/csw/korea/festival/main/config/lucene/HibernateSearchConfig.java @@ -7,11 +7,16 @@ import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory; import org.apache.lucene.analysis.en.PorterStemFilterFactory; +import org.apache.lucene.analysis.icu.ICUNormalizer2FilterFactory; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory; import org.apache.lucene.analysis.ko.KoreanNumberFilterFactory; import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory; import org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory; import org.apache.lucene.analysis.ko.KoreanTokenizerFactory; +import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory; +import org.apache.lucene.analysis.ngram.NGramFilterFactory; import org.apache.lucene.analysis.standard.StandardTokenizerFactory; +import org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory; import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -30,28 +35,53 @@ public LuceneAnalysisConfigurer luceneAnalysisConfigurer() { .tokenizer(StandardTokenizerFactory.class) .charFilter(HTMLStripCharFilterFactory.class) .tokenFilter(LowerCaseFilterFactory.class) + .tokenFilter(SynonymGraphFilterFactory.class) + .param("synonyms", "lucene/english_synonyms.txt") + .param("ignoreCase", "true") + .param("expand", "true") + + .tokenFilter(EdgeNGramFilterFactory.class) + .param("minGramSize", "3") + .param("maxGramSize", "10") .tokenFilter(EnglishPossessiveFilterFactory.class) + .tokenFilter(StopFilterFactory.class) - // let it use the default EnglishAnalyzer stopwords .param("ignoreCase", "true") .tokenFilter(PorterStemFilterFactory.class); + // Korean Analyzer context.analyzer("korean").custom() .tokenizer(KoreanTokenizerFactory.class) .charFilter(HTMLStripCharFilterFactory.class) .charFilter(CJKWidthCharFilterFactory.class) + + .tokenFilter(LowerCaseFilterFactory.class) + .tokenFilter(SynonymGraphFilterFactory.class) + .param("synonyms", "lucene/korean_synonyms.txt") + .param("ignoreCase", "true") + .param("expand", "true") + .tokenFilter(KoreanReadingFormFilterFactory.class) .tokenFilter(KoreanPartOfSpeechStopFilterFactory.class) - // FIX ME: If adverbs are not meaningful for your searches, consider adding MAG and MAJ to the list. + // FIX ME: If adverbs are not meaningful for searches, consider adding MAG and MAJ to the list. .param("tags", "E,EP,EF,EC,ETN,ETM,IC,J,MM,SP,SSC,SSO,SC,SE,XPN,SF,SY,XSA,UNKNOWN") - .tokenFilter(KoreanNumberFilterFactory.class); + .tokenFilter(KoreanNumberFilterFactory.class) + + .tokenFilter(NGramFilterFactory.class) + .param("minGramSize", "2") + .param("maxGramSize", "5"); // Multi-lingual Analyzer context.analyzer("multilingual").custom() - .tokenizer(StandardTokenizerFactory.class) + .tokenizer(ICUTokenizerFactory.class) .charFilter(HTMLStripCharFilterFactory.class) - .tokenFilter(LowerCaseFilterFactory.class); + .tokenFilter(LowerCaseFilterFactory.class) + .tokenFilter(ICUNormalizer2FilterFactory.class) + .param("name", "nfkc_cf") + .tokenFilter(StopFilterFactory.class) + .param("ignoreCase", "true"); +// .param("words", "stopwords.txt"); try { context.analyzer("seok").instance(new CustomKoreanAnalyzer()); diff --git a/src/main/java/csw/korea/festival/main/festival/model/Festival.java b/src/main/java/csw/korea/festival/main/festival/model/Festival.java index ee0067c..14792b3 100644 --- a/src/main/java/csw/korea/festival/main/festival/model/Festival.java +++ b/src/main/java/csw/korea/festival/main/festival/model/Festival.java @@ -27,10 +27,10 @@ public class Festival { @Column(unique = true, nullable = false) private String festivalId; // Festival ID - @FullTextField(analyzer = "seok") + @FullTextField(analyzer = "korean") private String name; // Festival Name - @FullTextField(analyzer = "seok") + @FullTextField(analyzer = "korean") @Column(length = 1000) private String summary; // Festival Summary @@ -42,7 +42,7 @@ public class Festival { @Column(name = "end_date") private LocalDate endDate; - @FullTextField(analyzer = "seok") + @FullTextField(analyzer = "korean") private String address; // Address private String usageFeeInfo; // Festival Usage Fee Information diff --git a/src/main/java/csw/korea/festival/main/festival/service/FestivalSearchService.java b/src/main/java/csw/korea/festival/main/festival/service/FestivalSearchService.java index 1035b66..75437c2 100644 --- a/src/main/java/csw/korea/festival/main/festival/service/FestivalSearchService.java +++ b/src/main/java/csw/korea/festival/main/festival/service/FestivalSearchService.java @@ -35,56 +35,77 @@ public class FestivalSearchService { public FestivalPage searchFestivals(String query, int page, int size) { SearchSession searchSession = Search.session(entityManager); - // 1. 쿼리를 개별 단어로 분할 - // String[] terms = query.split("\\s+"); + // 1. Split the query into individual terms + String[] terms = query.split("\\s+"); - String qwertyKorean = Korean.toHangul(query); // can be just normal English words but also trying to convert to Korean + // 2. Convert each term using qwerty to Korean conversion + String[] qwertyTerms = Arrays.stream(terms) + .map(Korean::toHangul) + .toArray(String[]::new); - // 2. 부울 쿼리 구성 + // 3. Build the boolean query dynamically SearchResult result = searchSession.search(Festival.class) - .where(f -> f.bool() - .should(f.match() - .fields("name", "nameEn") - .matching(query) - .boost(4.0f)) - .should(f.match() - .fields("name", "nameEn") - .matching(qwertyKorean) - .boost(3.0f)) + .where(f -> { + // Start a boolean predicate + BooleanPredicateClausesStep boolQuery = f.bool(); - .should(f.match() - .fields("summary", "summaryEn") - .matching(query) - .boost(2.0f)) - .should(f.match() + // For each term, add 'should' clauses + for (String term : terms) { + boolQuery.should(f.match() + .fields("name", "nameEn") + .matching(term) + .boost(9.0f)); + boolQuery.should(f.match() .fields("summary", "summaryEn") - .matching(qwertyKorean) - .boost(1.0f)) - - .should(f.match() + .matching(term) + .boost(7.0f)); + boolQuery.should(f.match() .field("categoryDisplayNames") - .matching(query) - .boost(1.0f)) - - .should(f.wildcard() + .matching(term) + .boost(6.0f)); + boolQuery.should(f.wildcard() .field("address") - .matching(STR."*\{query}*") - .boost(5.0f)) - .should(f.wildcard() - .field("address") - .matching(STR."*\{qwertyKorean}*") - .boost(2.0f)) - - .should(f.phrase() - .fields("province", "city", "district", "town", "street") - .matching(query) - .boost(6.0f)) - .should(f.phrase() + .matching("*" + term + "*") + .boost(10.0f)); + boolQuery.should(f.phrase() .fields("province", "city", "district", "town", "street") - .matching(qwertyKorean) - .boost(3.0f)) - .minimumShouldMatchNumber(1) // at least one should clause must match for a document to be included. - ) + .matching(term) + // .slop(2) + .boost(15.0f)); + } + + // Do the same for qwerty Korean converted terms +// for (String term : qwertyTerms) { +// boolQuery.should(f.match() +// .fields("name", "nameEn") +// .matching(term) +// .boost(3.0f)); +// boolQuery.should(f.match() +// .fields("summary", "summaryEn") +// .matching(term) +// .boost(1.0f)); +// boolQuery.should(f.wildcard() +// .field("address") +// .matching("*" + term + "*") +// .boost(2.0f)); +// boolQuery.should(f.phrase() +// .fields("province", "city", "district", "town", "street") +// .matching(term) +// .slop(2) +// .boost(3.0f)); +// } + + // 4. minimumShouldMatch to prefer documents that match more terms + // at least half of the terms to match + // int minShouldMatch = Math.max(1, terms.length); + + // Require a percentage of terms to match. + boolQuery.minimumShouldMatchPercent(50); + + + return boolQuery; + }) + // .highlighter(f -> f.field("name").field("summary")) .fetch(page * size, size); int totalHits = (int) result.total().hitCount(); @@ -92,16 +113,17 @@ public FestivalPage searchFestivals(String query, int page, int size) { int end = Math.min(start + size, totalHits); List paginatedFestivals = result.hits().subList(start, end); - // 페이지네이션된 페스티벌의 날씨 정보 가져오기 + // Process the festivals to include weather information paginatedFestivals = festivalWeatherService.processFestivalsWeather(paginatedFestivals); - // 페스티벌 페이지 객체 생성 + // Create and return the festival page object FestivalPage festivalPage = new FestivalPage(); festivalPage.setContent(paginatedFestivals); festivalPage.setPageNumber(page); festivalPage.setPageSize(size); festivalPage.setTotalElements(totalHits); - festivalPage.setTotalPages((totalHits + size - 1) / size); // 올림 계산 + festivalPage.setTotalPages((totalHits + size - 1) / size); + return festivalPage; } diff --git a/src/main/resources/lucene/english_synonyms.txt b/src/main/resources/lucene/english_synonyms.txt new file mode 100644 index 0000000..8f292f5 --- /dev/null +++ b/src/main/resources/lucene/english_synonyms.txt @@ -0,0 +1,28 @@ +festival, fest, festa, fair, event, gala, celebration, party, gathering +magic festival, magic festa, magic fair +light festival, light festa, illumination festival +lantern festival, lantern festa, lantern fair +seaweed festival, seaweed festa, seaweed fair +beer festival, beer festa, beer fair +autumn festival, fall festival, autumn festa, fall festa +winter festival, cold festival, winter festa, cold festa +spring festival, spring festa, spring fair +summer festival, summer festa, summer fair +concert, performance, show, gig, recital, live show +busking, street performance, street busking +dance, dancing, dance performance +art, arts, artistic performance +cultural performance, cultural show, cultural gala +food festival, culinary festival, food fair, cuisine festival +food trucks, food carts, mobile kitchens +craft market, handmade market, artisan market +flea market, bazaar, swap meet +photo zone, photo spot, photo area +fireworks, fireworks show, pyrotechnics +experience booth, experience stand, interactive booth +handmade, handcrafted, artisanal +celebratory performance, celebratory show +family-friendly, family-oriented, family suitable +kids activities, children activities, kids programs +traditional performance, traditional show, heritage performance +night market, night bazaar, evening market diff --git a/src/main/resources/lucene/korean_synonyms.txt b/src/main/resources/lucene/korean_synonyms.txt new file mode 100644 index 0000000..ff36d9f --- /dev/null +++ b/src/main/resources/lucene/korean_synonyms.txt @@ -0,0 +1,30 @@ +축제, 페스티벌, 행사, 축제회, 잔치, 모임, 갈라, 파티, 모임 +매직 페스티벌, 매직 축제, 마술 축제 +불빛 축제, 빛 축제, 조명 축제 +유등 축제, 랜턴 축제, 등불 축제 +해조 축제, 김 축제, 해초 축제 +맥주 축제, 맥주 페스티벌, 맥주 잔치 +가을 축제, 추계 축제, 가을 행사 +겨울 축제, 추운 축제, 겨울 행사 +봄 축제, 봄 행사, 봄 축제회 +여름 축제, 여름 행사, 여름 축제회 +공연, 쇼, 공연회, 콘서트, 리사이틀 +버스킹, 거리 공연, 길거리 공연 +춤, 댄싱, 춤 공연, 무용 +예술, 예술 공연, 아트 공연 +문화 공연, 문화 쇼, 문화 갈라 +음식 축제, 요리 축제, 음식 행사, 요리 행사 +푸드트럭, 푸드 카트, 모바일 주방 +수공예 시장, 핸드메이드 시장, 장인 시장 +플리마켓, 벼룩 시장, 스왑 미트 +포토존, 사진 촬영 장소, 사진 스팟 +불꽃놀이, 불꽃쇼, 파이로테크닉 +체험 부스, 체험 스탠드, 인터랙티브 부스 +수제, 수공예, 아티산 +축하 공연, 축하 쇼, 축하 갈라 +가족 친화적, 가족 지향적, 가족 적합 +어린이 활동, 아이들 활동, 어린이 프로그램 +전통 공연, 전통 쇼, 유산 공연 +야시장, 야간 시장, 저녁 시장 +야외 공연, 야외 쇼, 야외 공연회 +디저트, 디저트 부스, 스위트, 스위트 부스, 먹거리 \ No newline at end of file