diff --git a/app/models/search.rb b/app/models/search.rb index 3b8127ef18..f81b9a9d81 100644 --- a/app/models/search.rb +++ b/app/models/search.rb @@ -1,6 +1,4 @@ module Search - CJK_PATTERN = /\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/ - def self.table_name_prefix "search_" end diff --git a/app/models/search/highlighter.rb b/app/models/search/highlighter.rb index 8672ed3c83..d2b53d7720 100644 --- a/app/models/search/highlighter.rb +++ b/app/models/search/highlighter.rb @@ -13,14 +13,8 @@ def highlight(text) result = text.dup terms.each do |term| - if term.match?(Search::CJK_PATTERN) - result.gsub!(/(#{Regexp.escape(term)})/i) do |match| - "#{OPENING_MARK}#{match}#{CLOSING_MARK}" - end - else - result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| - "#{OPENING_MARK}#{match}#{CLOSING_MARK}" - end + result.gsub!(/\b(#{Regexp.escape(term)}\w*)\b/i) do |match| + "#{OPENING_MARK}#{match}#{CLOSING_MARK}" end end diff --git a/app/models/search/query.rb b/app/models/search/query.rb index 6bf06ef420..5fe0829e41 100644 --- a/app/models/search/query.rb +++ b/app/models/search/query.rb @@ -33,7 +33,7 @@ def sanitize(terms) end def remove_invalid_search_characters(terms) - terms.gsub(/[^\p{L}\p{N}_"]/, " ") + terms.gsub(/[^\w"]/, " ") end def remove_unbalanced_quotes(terms) diff --git a/app/models/search/record/sqlite.rb b/app/models/search/record/sqlite.rb index 73b3b675fa..ae0d34281b 100644 --- a/app/models/search/record/sqlite.rb +++ b/app/models/search/record/sqlite.rb @@ -8,10 +8,9 @@ module Search::Record::SQLite has_one :search_records_fts, -> { with_rowid }, class_name: "Search::Record::SQLite::Fts", foreign_key: :rowid, primary_key: :id, dependent: :destroy - before_save :stem_content after_save :upsert_to_fts5_table - scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", Search::Stemmer.stem(query.to_s)) } + scope :matching, ->(query, account_id) { joins(:search_records_fts).where("search_records_fts MATCH ?", query) } end class_methods do @@ -43,11 +42,6 @@ def comment_body end private - def stem_content - self.title = Search::Stemmer.stem(title) if title_changed? - self.content = Search::Stemmer.stem(content) if content_changed? - end - def escape_fts_highlight(html) return nil unless html.present? diff --git a/app/models/search/stemmer.rb b/app/models/search/stemmer.rb index 2ae0e1bf15..f6a6c56d44 100644 --- a/app/models/search/stemmer.rb +++ b/app/models/search/stemmer.rb @@ -5,43 +5,9 @@ module Search::Stemmer def stem(value) if value.present? - tokenize(value).join(" ") + value.gsub(/[^\w\s]/, "").split(/\s+/).map { |word| STEMMER.stem(word.downcase) }.join(" ") else value end end - - private - def tokenize(value) - tokens = [] - current_word = +"" - - value.each_char do |char| - if cjk_character?(char) - if current_word.present? - tokens << stem_word(current_word) - current_word = +"" - end - tokens << char - elsif char =~ /[\p{L}\p{N}_]/ - current_word << char - else - if current_word.present? - tokens << stem_word(current_word) - current_word = +"" - end - end - end - - tokens << stem_word(current_word) if current_word.present? - tokens - end - - def cjk_character?(char) - char.match?(Search::CJK_PATTERN) - end - - def stem_word(word) - STEMMER.stem(word.downcase) - end end diff --git a/test/models/search/highlighter_test.rb b/test/models/search/highlighter_test.rb index 822e4913d6..a065c4da19 100644 --- a/test/models/search/highlighter_test.rb +++ b/test/models/search/highlighter_test.rb @@ -82,34 +82,6 @@ class Search::HighlighterTest < ActiveSupport::TestCase assert_equal "<script>#{mark('test')}</script>", result end - test "highlight Chinese characters" do - highlighter = Search::Highlighter.new("测试") - result = highlighter.highlight("这是一个测试文本") - - assert_equal "这是一个#{mark('测试')}文本", result - end - - test "highlight Japanese characters" do - highlighter = Search::Highlighter.new("テスト") - result = highlighter.highlight("これはテストです") - - assert_equal "これは#{mark('テスト')}です", result - end - - test "highlight Korean characters" do - highlighter = Search::Highlighter.new("테스트") - result = highlighter.highlight("이것은 테스트입니다") - - assert_equal "이것은 #{mark('테스트')}입니다", result - end - - test "highlight mixed CJK and English" do - highlighter = Search::Highlighter.new("world 世界") - result = highlighter.highlight("hello world 你好世界") - - assert_equal "hello #{mark('world')} 你好#{mark('世界')}", result - end - private def mark(text) "#{Search::Highlighter::OPENING_MARK}#{text}#{Search::Highlighter::CLOSING_MARK}" diff --git a/test/models/search/query_test.rb b/test/models/search/query_test.rb deleted file mode 100644 index 628b9e69e8..0000000000 --- a/test/models/search/query_test.rb +++ /dev/null @@ -1,57 +0,0 @@ -require "test_helper" - -class Search::QueryTest < ActiveSupport::TestCase - setup do - @account = accounts(:"37s") - Current.account = @account - end - - test "sanitize preserves ASCII words" do - query = build_query("hello world") - - assert_equal "hello world", query.terms - end - - test "sanitize preserves Chinese characters" do - query = build_query("测试文本") - - assert_equal "测试文本", query.terms - end - - test "sanitize preserves Japanese characters" do - query = build_query("テスト") - - assert_equal "テスト", query.terms - end - - test "sanitize preserves Korean characters" do - query = build_query("테스트") - - assert_equal "테스트", query.terms - end - - test "sanitize preserves mixed CJK and English" do - query = build_query("hello 世界 test") - - assert_equal "hello 世界 test", query.terms - end - - test "sanitize removes special characters but preserves CJK" do - query = build_query("测试@文本") - - assert_equal "测试 文本", query.terms - end - - test "sanitize preserves quoted phrases with CJK" do - query = build_query('"你好世界"') - - assert_equal '"你好世界"', query.terms - end - - private - def build_query(terms) - query = Search::Query.wrap(terms) - query.validate - query - end -end diff --git a/test/models/search/stemmer_test.rb b/test/models/search/stemmer_test.rb index e2963ee10e..858ed2dca1 100644 --- a/test/models/search/stemmer_test.rb +++ b/test/models/search/stemmer_test.rb @@ -12,40 +12,4 @@ class Search::StemmerTest < ActiveSupport::TestCase assert_equal "test run jump walk", result end - - test "split Chinese characters for FTS indexing" do - result = Search::Stemmer.stem("测试") - - assert_equal "测 试", result - end - - test "split Japanese characters for FTS indexing" do - result = Search::Stemmer.stem("テスト") - - assert_equal "テ ス ト", result - end - - test "split Korean characters for FTS indexing" do - result = Search::Stemmer.stem("테스트") - - assert_equal "테 스 트", result - end - - test "mixed CJK and English" do - result = Search::Stemmer.stem("running 测试 test") - - assert_equal "run 测 试 test", result - end - - test "mixed CJK and English without spaces" do - result = Search::Stemmer.stem("hello世界test") - - assert_equal "hello 世 界 test", result - end - - test "CJK punctuation is treated as separator" do - result = Search::Stemmer.stem("你好。世界") - - assert_equal "你 好 世 界", result - end end