Skip to content

Commit

Permalink
Refactor: Optimize search result
Browse files Browse the repository at this point in the history
  • Loading branch information
Alfex4936 committed Oct 14, 2024
1 parent 0bab72b commit ac27cb3
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 55 deletions.
10 changes: 8 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@
<version>9.12.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analysis-icu -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-icu</artifactId>
<version>9.12.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.jboss.logging/jboss-logging -->
<!-- Hibernate Search now depends on JBoss Logging 3.6.-->
Expand Down Expand Up @@ -312,11 +318,11 @@
</goals>
<configuration>
<outputDirectory>${project.build.directory}/jacoco</outputDirectory>
<reports>
<formats>
<xml>true</xml> <!-- Ensure XML report is generated -->
<csv>false</csv> <!-- Disable CSV report -->
<html>true</html> <!-- Enable HTML report for visualization -->
</reports>
</formats>
<excludes>
<exclude>csw/korea/festival/main/common/annotation/**</exclude>
<!-- Exclude all classes ending with 'Config' -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.icu.ICUNormalizer2FilterFactory;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
import org.apache.lucene.analysis.ko.KoreanNumberFilterFactory;
import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory;
import org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory;
import org.apache.lucene.analysis.ko.KoreanTokenizerFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
Expand All @@ -30,28 +35,53 @@ public LuceneAnalysisConfigurer luceneAnalysisConfigurer() {
.tokenizer(StandardTokenizerFactory.class)
.charFilter(HTMLStripCharFilterFactory.class)
.tokenFilter(LowerCaseFilterFactory.class)
.tokenFilter(SynonymGraphFilterFactory.class)
.param("synonyms", "lucene/english_synonyms.txt")
.param("ignoreCase", "true")
.param("expand", "true")

.tokenFilter(EdgeNGramFilterFactory.class)
.param("minGramSize", "3")
.param("maxGramSize", "10")
.tokenFilter(EnglishPossessiveFilterFactory.class)

.tokenFilter(StopFilterFactory.class)
// let it use the default EnglishAnalyzer stopwords
.param("ignoreCase", "true")
.tokenFilter(PorterStemFilterFactory.class);


// Korean Analyzer
context.analyzer("korean").custom()
.tokenizer(KoreanTokenizerFactory.class)
.charFilter(HTMLStripCharFilterFactory.class)
.charFilter(CJKWidthCharFilterFactory.class)

.tokenFilter(LowerCaseFilterFactory.class)
.tokenFilter(SynonymGraphFilterFactory.class)
.param("synonyms", "lucene/korean_synonyms.txt")
.param("ignoreCase", "true")
.param("expand", "true")

.tokenFilter(KoreanReadingFormFilterFactory.class)
.tokenFilter(KoreanPartOfSpeechStopFilterFactory.class)
// FIX ME: If adverbs are not meaningful for your searches, consider adding MAG and MAJ to the list.
// FIX ME: If adverbs are not meaningful for searches, consider adding MAG and MAJ to the list.
.param("tags", "E,EP,EF,EC,ETN,ETM,IC,J,MM,SP,SSC,SSO,SC,SE,XPN,SF,SY,XSA,UNKNOWN")
.tokenFilter(KoreanNumberFilterFactory.class);
.tokenFilter(KoreanNumberFilterFactory.class)

.tokenFilter(NGramFilterFactory.class)
.param("minGramSize", "2")
.param("maxGramSize", "5");

// Multi-lingual Analyzer
context.analyzer("multilingual").custom()
.tokenizer(StandardTokenizerFactory.class)
.tokenizer(ICUTokenizerFactory.class)
.charFilter(HTMLStripCharFilterFactory.class)
.tokenFilter(LowerCaseFilterFactory.class);
.tokenFilter(LowerCaseFilterFactory.class)
.tokenFilter(ICUNormalizer2FilterFactory.class)
.param("name", "nfkc_cf")
.tokenFilter(StopFilterFactory.class)
.param("ignoreCase", "true");
// .param("words", "stopwords.txt");

try {
context.analyzer("seok").instance(new CustomKoreanAnalyzer());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ public class Festival {
@Column(unique = true, nullable = false)
private String festivalId; // Festival ID

@FullTextField(analyzer = "seok")
@FullTextField(analyzer = "korean")
private String name; // Festival Name

@FullTextField(analyzer = "seok")
@FullTextField(analyzer = "korean")
@Column(length = 1000)
private String summary; // Festival Summary

Expand All @@ -42,7 +42,7 @@ public class Festival {
@Column(name = "end_date")
private LocalDate endDate;

@FullTextField(analyzer = "seok")
@FullTextField(analyzer = "korean")
private String address; // Address

private String usageFeeInfo; // Festival Usage Fee Information
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,73 +35,95 @@ public class FestivalSearchService {
public FestivalPage searchFestivals(String query, int page, int size) {
SearchSession searchSession = Search.session(entityManager);

// 1. 쿼리를 개별 단어로 분할
// String[] terms = query.split("\\s+");
// 1. Split the query into individual terms
String[] terms = query.split("\\s+");

String qwertyKorean = Korean.toHangul(query); // the query may be ordinary English words, but we also try converting it to Korean
// 2. Convert each term using qwerty to Korean conversion
String[] qwertyTerms = Arrays.stream(terms)
.map(Korean::toHangul)
.toArray(String[]::new);

// 2. 부울 쿼리 구성
// 3. Build the boolean query dynamically
SearchResult<Festival> result = searchSession.search(Festival.class)
.where(f -> f.bool()
.should(f.match()
.fields("name", "nameEn")
.matching(query)
.boost(4.0f))
.should(f.match()
.fields("name", "nameEn")
.matching(qwertyKorean)
.boost(3.0f))
.where(f -> {
// Start a boolean predicate
BooleanPredicateClausesStep<?> boolQuery = f.bool();

.should(f.match()
.fields("summary", "summaryEn")
.matching(query)
.boost(2.0f))
.should(f.match()
// For each term, add 'should' clauses
for (String term : terms) {
boolQuery.should(f.match()
.fields("name", "nameEn")
.matching(term)
.boost(9.0f));
boolQuery.should(f.match()
.fields("summary", "summaryEn")
.matching(qwertyKorean)
.boost(1.0f))

.should(f.match()
.matching(term)
.boost(7.0f));
boolQuery.should(f.match()
.field("categoryDisplayNames")
.matching(query)
.boost(1.0f))

.should(f.wildcard()
.matching(term)
.boost(6.0f));
boolQuery.should(f.wildcard()
.field("address")
.matching(STR."*\{query}*")
.boost(5.0f))
.should(f.wildcard()
.field("address")
.matching(STR."*\{qwertyKorean}*")
.boost(2.0f))

.should(f.phrase()
.fields("province", "city", "district", "town", "street")
.matching(query)
.boost(6.0f))
.should(f.phrase()
.matching("*" + term + "*")
.boost(10.0f));
boolQuery.should(f.phrase()
.fields("province", "city", "district", "town", "street")
.matching(qwertyKorean)
.boost(3.0f))
.minimumShouldMatchNumber(1) // at least one should clause must match for a document to be included.
)
.matching(term)
// .slop(2)
.boost(15.0f));
}

// Do the same for qwerty Korean converted terms
// for (String term : qwertyTerms) {
// boolQuery.should(f.match()
// .fields("name", "nameEn")
// .matching(term)
// .boost(3.0f));
// boolQuery.should(f.match()
// .fields("summary", "summaryEn")
// .matching(term)
// .boost(1.0f));
// boolQuery.should(f.wildcard()
// .field("address")
// .matching("*" + term + "*")
// .boost(2.0f));
// boolQuery.should(f.phrase()
// .fields("province", "city", "district", "town", "street")
// .matching(term)
// .slop(2)
// .boost(3.0f));
// }

// 4. Use minimumShouldMatch to prefer documents that match more terms:
// require at least half of the terms to match
// int minShouldMatch = Math.max(1, terms.length);

// Require a percentage of terms to match.
boolQuery.minimumShouldMatchPercent(50);


return boolQuery;
})
// .highlighter(f -> f.field("name").field("summary"))
.fetch(page * size, size);

int totalHits = (int) result.total().hitCount();
int start = Math.min(page * size, totalHits);
int end = Math.min(start + size, totalHits);
List<Festival> paginatedFestivals = result.hits().subList(start, end);

// 페이지네이션된 페스티벌의 날씨 정보 가져오기
// Process the festivals to include weather information
paginatedFestivals = festivalWeatherService.processFestivalsWeather(paginatedFestivals);

// 페스티벌 페이지 객체 생성
// Create and return the festival page object
FestivalPage festivalPage = new FestivalPage();
festivalPage.setContent(paginatedFestivals);
festivalPage.setPageNumber(page);
festivalPage.setPageSize(size);
festivalPage.setTotalElements(totalHits);
festivalPage.setTotalPages((totalHits + size - 1) / size); // 올림 계산
festivalPage.setTotalPages((totalHits + size - 1) / size);


return festivalPage;
}
Expand Down
28 changes: 28 additions & 0 deletions src/main/resources/lucene/english_synonyms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
festival, fest, festa, fair, event, gala, celebration, party, gathering
magic festival, magic festa, magic fair
light festival, light festa, illumination festival
lantern festival, lantern festa, lantern fair
seaweed festival, seaweed festa, seaweed fair
beer festival, beer festa, beer fair
autumn festival, fall festival, autumn festa, fall festa
winter festival, cold festival, winter festa, cold festa
spring festival, spring festa, spring fair
summer festival, summer festa, summer fair
concert, performance, show, gig, recital, live show
busking, street performance, street busking
dance, dancing, dance performance
art, arts, artistic performance
cultural performance, cultural show, cultural gala
food festival, culinary festival, food fair, cuisine festival
food trucks, food carts, mobile kitchens
craft market, handmade market, artisan market
flea market, bazaar, swap meet
photo zone, photo spot, photo area
fireworks, fireworks show, pyrotechnics
experience booth, experience stand, interactive booth
handmade, handcrafted, artisanal
celebratory performance, celebratory show
family-friendly, family-oriented, family suitable
kids activities, children activities, kids programs
traditional performance, traditional show, heritage performance
night market, night bazaar, evening market
30 changes: 30 additions & 0 deletions src/main/resources/lucene/korean_synonyms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
축제, 페스티벌, 행사, 축제회, 잔치, 모임, 갈라, 파티
매직 페스티벌, 매직 축제, 마술 축제
불빛 축제, 빛 축제, 조명 축제
유등 축제, 랜턴 축제, 등불 축제
해조 축제, 김 축제, 해초 축제
맥주 축제, 맥주 페스티벌, 맥주 잔치
가을 축제, 추계 축제, 가을 행사
겨울 축제, 추운 축제, 겨울 행사
봄 축제, 봄 행사, 봄 축제회
여름 축제, 여름 행사, 여름 축제회
공연, 쇼, 공연회, 콘서트, 리사이틀
버스킹, 거리 공연, 길거리 공연
춤, 댄싱, 춤 공연, 무용
예술, 예술 공연, 아트 공연
문화 공연, 문화 쇼, 문화 갈라
음식 축제, 요리 축제, 음식 행사, 요리 행사
푸드트럭, 푸드 카트, 모바일 주방
수공예 시장, 핸드메이드 시장, 장인 시장
플리마켓, 벼룩 시장, 스왑 미트
포토존, 사진 촬영 장소, 사진 스팟
불꽃놀이, 불꽃쇼, 파이로테크닉
체험 부스, 체험 스탠드, 인터랙티브 부스
수제, 수공예, 아티산
축하 공연, 축하 쇼, 축하 갈라
가족 친화적, 가족 지향적, 가족 적합
어린이 활동, 아이들 활동, 어린이 프로그램
전통 공연, 전통 쇼, 유산 공연
야시장, 야간 시장, 저녁 시장
야외 공연, 야외 쇼, 야외 공연회
디저트, 디저트 부스, 스위트, 스위트 부스, 먹거리

0 comments on commit ac27cb3

Please sign in to comment.