Merged
1,454 changes: 1,405 additions & 49 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion Cargo.toml
@@ -17,19 +17,21 @@ basic-toml = "*"
bincode = "2.0.1"
csv = "1"
fast2s = "0.3.1"
fastembed = { version = "5.8.1", features = ["hf-hub-rustls-tls", "ort-download-binaries"], default-features = false, optional = true }
fjall = { version = "2.11.2", default-features = false, features = [
"single_writer_tx",
"miniz",
] }
indexmap = "2"
jiff = { version = "0.2.17", default-features = false, features = ["std"] }
qdrant-client = { version = "1.16.0", optional = true }
scraper = "0.25.0"
serde = { version = "1", features = ["derive"] }
stop-words = "0.9.0"
tantivy = "0.25.0"
tantivy-jieba = "0.18.0"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
tower = "0.5.2"
tower = "0.5.3"
tower-http = { version = "0.6.6", features = ["compression-zstd", "timeout"] }
tracing = { version = "0.1", features = [
"release_max_level_info",
@@ -38,6 +40,10 @@ tracing = { version = "0.1", features = [
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
zip = { version = "7", default-features = false, features = ["deflate"] }

[features]
default = []
vsearch = ["fastembed", "qdrant-client"]

[target.'cfg(not(target_os = "windows"))'.dependencies]
tikv-jemallocator = "0.6"

1 change: 1 addition & 0 deletions config.toml
@@ -4,6 +4,7 @@ index_with_full_text = false # whether establish full-text index
addr = "127.0.0.1:8081" # If allow LAN access, change it to "0.0.0.0:port".
export_limit = 10000
max_results = 50000
qdrant_grpc = "http://localhost:6334" # if feature "vsearch" not enabled, this config is ignored

# The raw data path you downloaded from the torrent, and you must NOT unzip it.
raw_data_path = "裁判文书全量数据(已完成)"
17 changes: 16 additions & 1 deletion src/bin/main.rs
@@ -1,13 +1,17 @@
use axum::{Router, http::StatusCode, routing::get};
use cases::{AppState, CONFIG, Tan, case, help, kv_sep_partition_option, search, style};
use fjall::Config;

use std::{net::SocketAddr, sync::Arc, time::Duration};
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{compression::CompressionLayer, timeout::TimeoutLayer};
use tracing::info;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

#[cfg(feature = "vsearch")]
use qdrant_client::Qdrant;

#[cfg(not(target_os = "windows"))]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
@@ -28,7 +32,18 @@ async fn main() {
let db = keyspace
.open_partition("cases", kv_sep_partition_option())
.unwrap();
let app_state = AppState { db, searcher };

#[cfg(feature = "vsearch")]
let qclient = Qdrant::from_url(CONFIG.qdrant_grpc.as_str())
.build()
.unwrap();

let app_state = AppState {
db,
searcher,
#[cfg(feature = "vsearch")]
qclient,
};

let middleware_stack =
ServiceBuilder::new()
2 changes: 2 additions & 0 deletions src/config.rs
@@ -12,6 +12,8 @@ pub struct Config {
pub raw_data_path: Option<String>,
pub export_limit: Option<usize>,
pub max_results: Option<usize>,
#[cfg(feature = "vsearch")]
pub qdrant_grpc: String,
}

impl Config {
120 changes: 95 additions & 25 deletions src/controller.rs
@@ -1,5 +1,3 @@
use std::sync::LazyLock;

use askama::Template;
use axum::{
body::Body,
@@ -10,13 +8,20 @@ use axum::{
use bincode::config::standard;
use indexmap::IndexSet;
use serde::Deserialize;
use std::sync::LazyLock;
use tantivy::{
DocAddress, Score, TantivyDocument,
collector::{Count, TopDocs},
schema::Value,
};
use tracing::info;

#[cfg(feature = "vsearch")]
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};

#[cfg(feature = "vsearch")]
use qdrant_client::qdrant::{SearchPointsBuilder, point_id::PointIdOptions};

use crate::{AppState, CONFIG, Case, remove_html_tags};

static EXPORT_LIMIT: LazyLock<usize> = LazyLock::new(|| CONFIG.export_limit.unwrap_or(10000));
@@ -51,6 +56,7 @@ pub struct QuerySearch {
search: Option<String>,
offset: Option<usize>,
export: Option<bool>,
search_type: Option<String>,
}

#[derive(Template)]
@@ -59,6 +65,8 @@ pub struct SearchPage {
search: String,
offset: usize,
total: usize,
search_type: String,
enable_vsearch: bool,
cases: Vec<(u32, String, Case)>,
}

@@ -72,35 +80,95 @@ pub async fn search(
}
let search = input.search.unwrap_or_default();
let export = input.export.unwrap_or_default();
let search_type =
if cfg!(feature = "vsearch") && input.search_type.as_deref() == Some("vsearch") {
"vsearch".to_owned()
} else {
"keyword".to_owned()
};
let limit = if export { *EXPORT_LIMIT } else { 20 };
let mut ids: IndexSet<u32> = IndexSet::with_capacity(20);
let mut total = 0;
if !search.is_empty() {
if export {
info!("exporting: {search}, offset: {offset}, limit: {limit}");
} else {
info!("searching: {search}, offset: {offset}, limit: {limit}");
}
if !search.trim().is_empty() {
let now = std::time::Instant::now();
let search = fast2s::convert(&search);
let (query, _) = state.searcher.query_parser.parse_query_lenient(&search);
let searcher = state.searcher.reader.searcher();
total = searcher.search(&query, &Count).unwrap();

let top_docs: Vec<(Score, DocAddress)> = searcher
.search(&query, &TopDocs::with_limit(limit).and_offset(offset))
.unwrap_or_default();

for (_score, doc_address) in top_docs {
if let Some(id) = searcher
.doc::<TantivyDocument>(doc_address)
.unwrap()
.get_first(state.searcher.id)
.unwrap()
.as_u64()
{
ids.insert(id as u32);
if search_type == "keyword" {
let (query, _) = state.searcher.query_parser.parse_query_lenient(&search);
let searcher = state.searcher.reader.searcher();
total = searcher.search(&query, &Count).unwrap();

let top_docs: Vec<(Score, DocAddress)> = searcher
.search(&query, &TopDocs::with_limit(limit).and_offset(offset))
.unwrap_or_default();

for (_score, doc_address) in top_docs {
if let Some(id) = searcher
.doc::<TantivyDocument>(doc_address)
.unwrap()
.get_first(state.searcher.id)
.unwrap()
.as_u64()
{
ids.insert(id as u32);
}
}
} else {
#[cfg(feature = "vsearch")]
if search_type == "vsearch" {
{
let mut model = TextEmbedding::try_new(
InitOptions::new(EmbeddingModel::BGESmallZHV15)
.with_show_download_progress(true),
)
.unwrap();
Comment on lines +119 to +123

⚠️ Potential issue | 🟠 Major

Performance: Embedding model is initialized on every request.

TextEmbedding::try_new() loads model weights and is expensive (potentially hundreds of milliseconds or more). This should be initialized once at startup and shared via AppState, similar to how searcher is handled.

🔧 Recommended approach

Initialize the embedding model once at startup in main.rs and add it to AppState:

// In AppState (src/lib.rs):
#[cfg(feature = "vsearch")]
pub embedding_model: Arc<Mutex<TextEmbedding>>,

// In main.rs:
#[cfg(feature = "vsearch")]
let embedding_model = Arc::new(Mutex::new(
    TextEmbedding::try_new(
        InitOptions::new(EmbeddingModel::BGESmallZHV15)
            .with_show_download_progress(true),
    ).expect("Failed to initialize embedding model")
));

Then in the controller, use state.embedding_model.lock().unwrap().embed(...).
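
A minimal sketch of the controller side under the same assumption (the `embedding_model: Arc<Mutex<TextEmbedding>>` field and the `embed_query` helper are the reviewer's proposal, not something that exists in this PR):

use std::sync::{Arc, Mutex};
use fastembed::TextEmbedding;

// Hypothetical helper: reuses the shared model instead of constructing a
// new TextEmbedding on every request.
#[cfg(feature = "vsearch")]
fn embed_query(model: &Arc<Mutex<TextEmbedding>>, search: &str) -> Option<Vec<f32>> {
    // Hold the lock only for the duration of the embed call.
    let mut model = model.lock().ok()?;
    let mut vecs = model.embed(vec![search], None).ok()?;
    vecs.pop() // one input text yields at most one embedding
}

Returning an Option here also sidesteps the unwrap() panics flagged in the next comment: the caller can fall back to an empty result page when embedding fails.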

🤖 Prompt for AI Agents
In `@src/controller.rs` around lines 119 - 123, The embedding model is being
constructed per-request via TextEmbedding::try_new(...) (with InitOptions and
EmbeddingModel::BGESmallZHV15) which is expensive; instead initialize the
TextEmbedding once at startup, add it to AppState (e.g. an
Arc<Mutex<TextEmbedding>> field), create it in main.rs during bootstrap, and
then in controller replace the per-request TextEmbedding::try_new call with
using state.embedding_model.lock().unwrap().embed(...) (or equivalent locking)
to reuse the single shared model instance.

let query_vec = model.embed(vec![&search], None).unwrap();

⚠️ Potential issue | 🟠 Major

Multiple unwrap() calls can cause panics on malformed data.

Line 124: model.embed(...).unwrap() will panic if embedding fails.
Line 132: query_vec.into_iter().next().unwrap() will panic if the vector is empty.

Consider graceful error handling.

🔧 Suggested fix with error handling
-                    let query_vec = model.embed(vec![&search], None).unwrap();
+                    let query_vec = match model.embed(vec![&search], None) {
+                        Ok(v) => v,
+                        Err(e) => {
+                            tracing::error!("Embedding failed: {e}");
+                            return into_response(&SearchPage {
+                                search,
+                                search_type,
+                                offset,
+                                cases: vec![],
+                                total: 0,
+                                enable_vsearch: true,
+                            });
+                        }
+                    };
+
+                    let Some(embedding) = query_vec.into_iter().next() else {
+                        tracing::error!("Empty embedding result");
+                        return into_response(&SearchPage { /* ... */ });
+                    };

Also applies to: 132-132

🤖 Prompt for AI Agents
In `@src/controller.rs` at line 124, The code currently calls
model.embed(vec![&search], None).unwrap() and later uses
query_vec.into_iter().next().unwrap(), which can panic on embed failures or
empty results; change these to propagate or handle errors instead: replace the
first unwrap by handling the Result from model.embed (using ? to return an Err
from the enclosing function or match/if let to log and return an error) and
replace the second unwrap by checking for Some(value) from
query_vec.into_iter().next() (e.g., match or if let Some(vec) { ... } else {
return Err(...) } ), ensuring the surrounding function signature returns a
Result or otherwise returns a controlled error path; reference model.embed,
query_vec, and the into_iter().next() usage when making these changes.


let client = state.qclient;
let search_limit = limit + offset;
if let Ok(search_result) = client
.search_points(
SearchPointsBuilder::new(
"cases",
query_vec.into_iter().next().unwrap(),
search_limit as u64,
)
.with_payload(false)
.limit(limit as u64)
.offset(offset as u64),
)
.await
{
for point in &search_result.result {
let id = point
.id
.as_ref()
.unwrap()
.point_id_options
.as_ref()
.unwrap();
if let PointIdOptions::Num(id) = id {
ids.insert(*id as u32);
}
}
} else {
tracing::error!("Qdrant search_points failed");
}
}
}
Comment on lines +115 to 157

⚠️ Potential issue | 🟡 Minor

total is never populated for vsearch, breaking pagination display.

For keyword search, total is set on line 98. For vsearch, total remains 0. This will likely confuse users and break pagination UI. Qdrant can return the total count if you need it.

Also, line 117 has a redundant check—search_type == "vsearch" is always true in this branch since we're in the else of search_type == "keyword".

💡 Fix for redundant check
         } else {
             #[cfg(feature = "vsearch")]
-            if search_type == "vsearch" {
                 {
                     // vsearch logic...
                 }
-            }
         }
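
For the missing total, one option (a sketch only, not part of the PR) is to report the collection's point count as an upper bound, since ANN search has no exact notion of "matching" documents. This assumes qdrant-client's count API; verify the builder and response field names against the qdrant-client 1.16 docs before relying on them:

use qdrant_client::qdrant::CountPointsBuilder;

// Inside the vsearch branch, after the Qdrant client has been obtained:
if let Ok(resp) = client
    .count(CountPointsBuilder::new("cases").exact(false))
    .await
{
    total = resp.result.map(|r| r.count as usize).unwrap_or(0);
}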

let elapsed = now.elapsed().as_secs_f32();
if export {
info!("exporting: {search}, total:{total}, offset: {offset}, limit: {limit}");
} else if search_type == "keyword" {
info!(
"keyword search: {search}, total: {total}, offset: {offset}, limit: {limit}, elapsed: {elapsed}s"
);
} else {
info!(
"vsearch search: {search}, offset: {offset}, limit: {limit}, elapsed: {elapsed}s "
);
}
}

let mut cases = Vec::with_capacity(ids.len());
@@ -168,9 +236,11 @@ pub async fn search(

let body = SearchPage {
search,
search_type,
offset,
cases,
total,
enable_vsearch: cfg!(feature = "vsearch"),
};

into_response(&body)
5 changes: 5 additions & 0 deletions src/lib.rs
@@ -8,6 +8,9 @@ use std::sync::Arc;
use tantivy::Searcher;
pub use tantivy::Tan;

#[cfg(feature = "vsearch")]
use qdrant_client::Qdrant;

mod config;
mod controller;
mod tantivy;
@@ -16,6 +19,8 @@ mod tantivy;
pub struct AppState {
pub db: PartitionHandle,
pub searcher: Arc<Searcher>,
#[cfg(feature = "vsearch")]
pub qclient: Qdrant,
}

pub fn kv_sep_partition_option() -> PartitionCreateOptions {
16 changes: 16 additions & 0 deletions static/style.css
@@ -140,6 +140,22 @@ a:hover {
transform: scale(0.95);
}

.suggest_vsearch {
background: linear-gradient(90deg, #e0f7fa 0%, #f1f8e9 100%);
border-radius: 6px;
padding: 0.7em 1.2em;
margin: 1.2em auto 1.5em auto;
font-size: 1.05rem;
}

.search-type-select {
background: var(--surface);
border: none;
border-left: 1px solid var(--primary-light);
padding: 0 0.8rem;
font-size: 1rem;
}

/* ===== Secondary Navigation ===== */
.search-second-nav {
max-width: var(--max-width);
29 changes: 22 additions & 7 deletions templates/search.html
@@ -22,6 +22,12 @@
name="search"
value="{{ search }}"
/>
{% if enable_vsearch %}
<select class="search-type-select" id="search_type" name="search_type">
<option value="keyword" {% if search_type != "vsearch" %} selected {% endif %}>关键词搜索</option>
<option value="vsearch" {% if search_type == "vsearch" %} selected {% endif %}>语义搜索</option>
</select>
{% endif %}
<button type="submit" class="search-button">🔍</button>
</div>
</form>
@@ -38,17 +44,26 @@
</div>
<div>
<p>
找到 {{ total }},<a
class="noline"
{% if search_type =="keyword" %} 找到 {{ total }},{% endif %}
<a class="noline"
title="最多导出10000条,调整offset参数可获得更多结果,offset=10000,即可获得第10000~20000条结果"
href="/?search={{ search }}&offset={{ offset }}&export=true"
>导出</a
>
href="/?search={{ search }}&offset={{ offset }}&export=true&search_type={{ search_type }}"
>导出</a>
</p>
</div>
</nav>
</header>

{% if enable_vsearch %}
{% if search_type == "keyword" %}
{% if total < 30 %}
{% if !search.is_empty() %}
<p class="suggest_vsearch">当前结果较少,建议使用 <a href="/?search={{ search }}&search_type=vsearch">语义搜索</a></p>
{% endif %}
{% endif %}
{% endif %}
{% endif %}

<main class="search-results">
{% for (id, preview, case) in cases %}
<div class="search-result-text">
@@ -67,9 +82,9 @@ <h3 class="underlineonhover">{{ case.case_name }}</h3>

<div class="pagination">
{% if cases.len() >= 20 %} {% if offset >= 20 %}
<a href="/?search={{ search }}&offset={{ offset - 20 }}">上一页</a
<a href="/?search={{ search }}&offset={{ offset - 20 }}&search_type={{ search_type }}">上一页</a
>&nbsp;&nbsp;&nbsp;&nbsp; {% endif %}
<a href="/?search={{ search }}&offset={{ offset + 20 }}">下一页</a>
<a href="/?search={{ search }}&offset={{ offset + 20 }}&search_type={{ search_type }}">下一页</a>
{% endif %}
</div>
<footer>