From 1bb493e0c981344f81d21d69db04a2b849b7c427 Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sat, 21 Sep 2024 23:59:18 +0900 Subject: [PATCH 1/6] add --- README.md | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index bf24961..e06763f 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ inspired by [ranx](https://github.com/AmenRa/ranx) and [Sakai's book](https://ww ## Features - **IRer-friendly**: - The library is designed to be easy to use for developers in information retrieval - by providing TREC-like data structures, such as Qrels and Run. + The library is designed to be easy to use for developers in information retrieval. - **Flexible**: The library supports various evaluation metrics, such as Precision, MAP, MRR, and nDCG. The supported metrics are available in [Metric](https://docs.rs/elinor/latest/elinor/metrics/enum.Metric.html). @@ -33,30 +32,30 @@ RUSTDOCFLAGS="--html-in-header katex.html" cargo doc --no-deps --open ## Getting Started -A simple routine to prepare Qrels and Run data structures +A simple routine to prepare gold and predicted relevance scores and evaluate them using Precision@3, MAP, MRR, and nDCG@3: ```rust -use elinor::{QrelsBuilder, RunBuilder, Metric}; - -// Construct Qrels data structure. -let mut qb = QrelsBuilder::new(); -qb.add_score("q_1", "d_1", 1)?; -qb.add_score("q_1", "d_2", 0)?; -qb.add_score("q_1", "d_3", 2)?; -qb.add_score("q_2", "d_2", 2)?; -qb.add_score("q_2", "d_4", 1)?; -let qrels = qb.build(); - -// Construct Run data structure. -let mut rb = RunBuilder::new(); -rb.add_score("q_1", "d_1", 0.5.into())?; -rb.add_score("q_1", "d_2", 0.4.into())?; -rb.add_score("q_1", "d_3", 0.3.into())?; -rb.add_score("q_2", "d_4", 0.1.into())?; -rb.add_score("q_2", "d_1", 0.2.into())?; -rb.add_score("q_2", "d_3", 0.3.into())?; -let run = rb.build(); +use elinor::{GoldRelStoreBuilder, PredRelStoreBuilder, Metric}; + +// Prepare gold relevance scores. +let mut b = GoldRelStoreBuilder::new(); +b.add_score("q_1", "d_1", 1)?; +b.add_score("q_1", "d_2", 0)?; +b.add_score("q_1", "d_3", 2)?; +b.add_score("q_2", "d_2", 2)?; +b.add_score("q_2", "d_4", 1)?; +let gold_rels = b.build(); + +// Prepare predicted relevance scores. +let mut b = PredRelStoreBuilder::new(); +b.add_score("q_1", "d_1", 0.5.into())?; +b.add_score("q_1", "d_2", 0.4.into())?; +b.add_score("q_1", "d_3", 0.3.into())?; +b.add_score("q_2", "d_4", 0.1.into())?; +b.add_score("q_2", "d_1", 0.2.into())?; +b.add_score("q_2", "d_3", 0.3.into())?; +let pred_rels = b.build(); // The metrics to evaluate can be specified via Metric instances. let metrics = vec![ @@ -67,8 +66,8 @@ let metrics = vec![ "ndcg@3".parse()?, ]; -// Evaluate the qrels and run data. -let evaluated = elinor::evaluate(&qrels, &run, metrics.iter().cloned())?; +// Evaluate. +let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; // Macro-averaged scores. for metric in &metrics { From 3edad671c0cb36aed1b4061892ce09bd335f10ca Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sun, 22 Sep 2024 00:04:48 +0900 Subject: [PATCH 2/6] add --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e06763f..2f43e9a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Elinor: Evaluation Library in Information Retrieval +# Elinor: Evaluation Library in INfOrmation Retrieval

actions status From efe5b43bade8e38eee6faa81e0edb7ac14db4bbe Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sun, 22 Sep 2024 00:08:40 +0900 Subject: [PATCH 3/6] add --- src/metrics.rs | 5 ++++- src/metrics/hits.rs | 21 --------------------- src/metrics/success.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 22 deletions(-) create mode 100644 src/metrics/success.rs diff --git a/src/metrics.rs b/src/metrics.rs index 3f6decd..beb61e3 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -8,6 +8,7 @@ pub(crate) mod precision; pub(crate) mod r_precision; pub(crate) mod recall; pub(crate) mod reciprocal_rank; +pub(crate) mod success; use std::collections::HashMap; use std::fmt::Display; @@ -336,7 +337,9 @@ where let golds = gold_rels.get_map(query_id).unwrap(); let score = match metric { Metric::Hits { k } => hits::compute_hits(golds, sorted_preds, k, RELEVANT_LEVEL), - Metric::Success { k } => hits::compute_success(golds, sorted_preds, k, RELEVANT_LEVEL), + Metric::Success { k } => { + success::compute_success(golds, sorted_preds, k, RELEVANT_LEVEL) + } Metric::Precision { k } => { precision::compute_precision(golds, sorted_preds, k, RELEVANT_LEVEL) } diff --git a/src/metrics/hits.rs b/src/metrics/hits.rs index d25c7e8..b443274 100644 --- a/src/metrics/hits.rs +++ b/src/metrics/hits.rs @@ -25,24 +25,3 @@ where } hits as f64 } - -/// Returns 1 if at least one relevant document is found, 0 otherwise. -pub fn compute_success( - golds: &HashMap, - sorted_preds: &[Relevance], - k: usize, - rel_lvl: GoldScore, -) -> f64 -where - K: Eq + std::hash::Hash, -{ - let k = if k == 0 { sorted_preds.len() } else { k }; - for pred in sorted_preds.iter().take(k) { - if let Some(&rel) = golds.get(&pred.doc_id) { - if rel >= rel_lvl { - return 1.0; - } - } - } - 0.0 -} diff --git a/src/metrics/success.rs b/src/metrics/success.rs new file mode 100644 index 0000000..6c23663 --- /dev/null +++ b/src/metrics/success.rs @@ -0,0 +1,26 @@ +use std::collections::HashMap; + +use crate::GoldScore; +use crate::PredScore; +use crate::Relevance; + +/// Returns 1 if at least one relevant document is found, 0 otherwise. 
+pub fn compute_success( + golds: &HashMap, + sorted_preds: &[Relevance], + k: usize, + rel_lvl: GoldScore, +) -> f64 +where + K: Eq + std::hash::Hash, +{ + let k = if k == 0 { sorted_preds.len() } else { k }; + for pred in sorted_preds.iter().take(k) { + if let Some(&rel) = golds.get(&pred.doc_id) { + if rel >= rel_lvl { + return 1.0; + } + } + } + 0.0 +} From ed1a102bec1e7990c424a6e40cff61be75a1feb9 Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sun, 22 Sep 2024 00:32:01 +0900 Subject: [PATCH 4/6] add --- .github/workflows/ci.yml | 1 - elinor-evaluate/src/main.rs | 5 ++- examples/from_json.rs | 18 ++-------- examples/from_trec.rs | 18 ++-------- examples/simple.rs | 45 ------------------------- src/lib.rs | 65 ++++++++++++++++++++----------------- 6 files changed, 44 insertions(+), 108 deletions(-) delete mode 100644 examples/simple.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22ea933..e24678c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,7 +57,6 @@ jobs: cargo run --release --example from_json --features serde cargo run --release --example from_trec cargo run --release --example paired_bootstrap_test - cargo run --release --example simple correctness-test: name: Correctness test against trec_eval diff --git a/elinor-evaluate/src/main.rs b/elinor-evaluate/src/main.rs index 0f42d69..235685f 100644 --- a/elinor-evaluate/src/main.rs +++ b/elinor-evaluate/src/main.rs @@ -28,10 +28,9 @@ fn main() -> Result<(), Box> { let pred_rels = trec::parse_pred_rels_in_trec(load_lines(&args.pred_file)?.into_iter())?; let metrics = all_metrics(&args.ks); - let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; - for metric in &metrics { - let score = evaluated.mean_scores[metric]; + let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?; + let score = evaluated.mean_score(); println!("{metric}\t{score:.4}"); } diff --git a/examples/from_json.rs b/examples/from_json.rs index 6c85a5d..53af2da 100644 --- a/examples/from_json.rs +++ b/examples/from_json.rs @@ -61,22 +61,10 @@ fn main() -> Result<()> { Metric::NDCG { k: 3 }, Metric::NDCGBurges { k: 3 }, ]; - let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; - println!("=== Mean scores ==="); - for metric in &metrics { - let score = evaluated.mean_scores[metric]; - println!("{metric}: {score:.4}"); - } - - println!("\n=== Scores for each query ==="); - for metric in &metrics { - println!("{metric}"); - let qid_to_score = &evaluated.all_scores[metric]; - for qid in ["q_1", "q_2"] { - let score = qid_to_score[qid]; - println!("- {qid}: {score:.4}"); - } + for metric in metrics { + let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?; + println!("{:?}: {:.4}", metric, evaluated.mean_score()); } Ok(()) diff --git a/examples/from_trec.rs b/examples/from_trec.rs index 8dca1d4..ba595c7 100644 --- a/examples/from_trec.rs +++ b/examples/from_trec.rs @@ -37,22 +37,10 @@ q_2 0 d_4 3 0.1 SAMPLE Metric::NDCG { k: 3 }, Metric::NDCGBurges { k: 3 }, ]; - let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; - println!("=== Mean scores ==="); - for metric in &metrics { - let score = evaluated.mean_scores[metric]; - println!("{metric}: {score:.4}"); - } - - println!("\n=== Scores for each query ==="); - for metric in &metrics { - println!("{metric}"); - let qid_to_score = &evaluated.all_scores[metric]; - for qid in ["q_1", "q_2"] { - let score = qid_to_score[qid]; - println!("- {qid}: {score:.4}"); - } + 
for metric in metrics { + let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?; + println!("{:?}: {:.4}", metric, evaluated.mean_score()); } Ok(()) diff --git a/examples/simple.rs b/examples/simple.rs deleted file mode 100644 index abbdd3a..0000000 --- a/examples/simple.rs +++ /dev/null @@ -1,45 +0,0 @@ -use anyhow::Result; -use elinor::GoldRelStoreBuilder; -use elinor::Metric; -use elinor::PredRelStoreBuilder; - -fn main() -> Result<()> { - // Prepare gold relevance scores. - let mut b = GoldRelStoreBuilder::new(); - b.add_score("q_1", "d_1", 1)?; - b.add_score("q_1", "d_2", 0)?; - b.add_score("q_1", "d_3", 2)?; - b.add_score("q_2", "d_2", 2)?; - b.add_score("q_2", "d_4", 1)?; - let gold_rels = b.build(); - - // Prepare predicted relevance scores. - let mut b = PredRelStoreBuilder::new(); - b.add_score("q_1", "d_1", 0.5.into())?; - b.add_score("q_1", "d_2", 0.4.into())?; - b.add_score("q_1", "d_3", 0.3.into())?; - b.add_score("q_2", "d_4", 0.1.into())?; - b.add_score("q_2", "d_1", 0.2.into())?; - b.add_score("q_2", "d_3", 0.3.into())?; - let pred_rels = b.build(); - - // The metrics to evaluate can be specified via Metric instances. - let metrics = vec![ - Metric::Precision { k: 3 }, - Metric::AP { k: 0 }, // k=0 means all documents. - // The instances can also be specified via strings. - "rr".parse()?, - "ndcg@3".parse()?, - ]; - - // Evaluate. - let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; - - // Macro-averaged scores. - for metric in &metrics { - let score = evaluated.mean_scores[metric]; - println!("{metric}: {score:.4}"); - } - - Ok(()) -} diff --git a/src/lib.rs b/src/lib.rs index 4d74ebe..775cb5e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,6 +19,7 @@ //! ``` //! # fn main() -> Result<(), Box> { //! use elinor::{GoldRelStoreBuilder, PredRelStoreBuilder, Metric}; +//! use approx::assert_abs_diff_eq; //! //! // Prepare gold relevance scores. //! let mut b = GoldRelStoreBuilder::new(); @@ -48,14 +49,22 @@ //! "ndcg@3".parse()?, //! ]; //! -//! // Evaluate. -//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; +//! // Evaluate Precision@3. +//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?; +//! assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4); +//! +//! // Evaluate MAP, where all documents are considered via k=0. +//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::AP { k: 0 })?; +//! assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4); +//! +//! // Evaluate MRR, where the metric is specified via a string representation. +//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "rr".parse()?)?; +//! assert_abs_diff_eq!(evaluated.mean_score(), 0.6667, epsilon = 1e-4); +//! +//! // Evaluate nDCG@3, where the metric is specified via a string representation. +//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "ndcg@3".parse()?)?; +//! assert_abs_diff_eq!(evaluated.mean_score(), 0.4751, epsilon = 1e-4); //! -//! // Macro-averaged scores. -//! for metric in &metrics { -//! let score = evaluated.mean_scores[metric]; -//! println!("{metric}: {score:.4}"); -//! } //! // => precision@3: 0.5000 //! // => ap: 0.5000 //! 
// => rr: 0.6667 @@ -73,9 +82,9 @@ pub mod relevance; pub mod statistical_tests; pub mod trec; -use ordered_float::OrderedFloat; use std::collections::HashMap; -use std::collections::HashSet; + +use ordered_float::OrderedFloat; pub use metrics::Metric; pub use relevance::Relevance; @@ -102,34 +111,32 @@ pub type PredRelStoreBuilder = relevance::RelevanceStoreBuilder /// Data type to store evaluated scores. pub struct Evaluated { - /// Metric to macro-averaged score. - pub mean_scores: HashMap, + scores: HashMap, + mean_score: f64, +} + +impl Evaluated { + /// Returns the reference to the mappping from query ids to scores. + pub const fn scores(&self) -> &HashMap { + &self.scores + } - /// Metric to mapping from query ID to the score. - pub all_scores: HashMap>, + /// Returns the macro-averaged score. + pub const fn mean_score(&self) -> f64 { + self.mean_score + } } /// Evaluates the given gold_rels and pred_rels data using the specified metrics. -pub fn evaluate( +pub fn evaluate( gold_rels: &GoldRelStore, pred_rels: &PredRelStore, - metrics: M, + metric: Metric, ) -> Result, errors::ElinorError> where K: Clone + Eq + Ord + std::hash::Hash + std::fmt::Display, - M: IntoIterator, { - let metrics: HashSet = metrics.into_iter().collect(); - let mut mean_scores = HashMap::new(); - let mut all_scores = HashMap::new(); - for metric in metrics { - let result = metrics::compute_metric(gold_rels, pred_rels, metric)?; - let mean_score = result.values().sum::() / result.len() as f64; - mean_scores.insert(metric, mean_score); - all_scores.insert(metric, result); - } - Ok(Evaluated { - mean_scores, - all_scores, - }) + let scores = metrics::compute_metric(gold_rels, pred_rels, metric)?; + let mean_score = scores.values().sum::() / scores.len() as f64; + Ok(Evaluated { scores, mean_score }) } From c22b0488d0c63a78b60c5d1d9041a918743d8979 Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sun, 22 Sep 2024 00:38:01 +0900 Subject: [PATCH 5/6] add --- README.md | 37 ++++++++++++++++--------------------- elinor-evaluate/src/main.rs | 2 +- src/lib.rs | 14 -------------- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 2f43e9a..be5d830 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ and evaluate them using Precision@3, MAP, MRR, and nDCG@3: ```rust use elinor::{GoldRelStoreBuilder, PredRelStoreBuilder, Metric}; +use approx::assert_abs_diff_eq; // Prepare gold relevance scores. let mut b = GoldRelStoreBuilder::new(); @@ -57,27 +58,21 @@ b.add_score("q_2", "d_1", 0.2.into())?; b.add_score("q_2", "d_3", 0.3.into())?; let pred_rels = b.build(); -// The metrics to evaluate can be specified via Metric instances. -let metrics = vec![ - Metric::Precision { k: 3 }, - Metric::AP { k: 0 }, // k=0 means all documents. - // The instances can also be specified via strings. - "rr".parse()?, - "ndcg@3".parse()?, -]; - -// Evaluate. -let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?; - -// Macro-averaged scores. -for metric in &metrics { - let score = evaluated.mean_scores[metric]; - println!("{metric}: {score:.4}"); -} -// => precision@3: 0.5000 -// => ap: 0.5000 -// => rr: 0.6667 -// => ndcg@3: 0.4751 +// Evaluate Precision@3. +let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?; +assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4); + +// Evaluate MAP, where all documents are considered via k=0. 
+let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::AP { k: 0 })?; +assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4); + +// Evaluate MRR, where the metric is specified via a string representation. +let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "rr".parse()?)?; +assert_abs_diff_eq!(evaluated.mean_score(), 0.6667, epsilon = 1e-4); + +// Evaluate nDCG@3, where the metric is specified via a string representation. +let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "ndcg@3".parse()?)?; +assert_abs_diff_eq!(evaluated.mean_score(), 0.4751, epsilon = 1e-4); ``` Other examples are available in the [`examples`](https://github.com/kampersanda/elinor/tree/main/examples) directory. diff --git a/elinor-evaluate/src/main.rs b/elinor-evaluate/src/main.rs index 235685f..8447c29 100644 --- a/elinor-evaluate/src/main.rs +++ b/elinor-evaluate/src/main.rs @@ -28,7 +28,7 @@ fn main() -> Result<(), Box> { let pred_rels = trec::parse_pred_rels_in_trec(load_lines(&args.pred_file)?.into_iter())?; let metrics = all_metrics(&args.ks); - for metric in &metrics { + for metric in metrics { let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?; let score = evaluated.mean_score(); println!("{metric}\t{score:.4}"); diff --git a/src/lib.rs b/src/lib.rs index 775cb5e..16cb38e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,15 +40,6 @@ //! b.add_score("q_2", "d_3", 0.3.into())?; //! let pred_rels = b.build(); //! -//! // The metrics to evaluate can be specified via Metric instances. -//! let metrics = vec![ -//! Metric::Precision { k: 3 }, -//! Metric::AP { k: 0 }, // k=0 means all documents. -//! // The instances can also be specified via strings. -//! "rr".parse()?, -//! "ndcg@3".parse()?, -//! ]; -//! //! // Evaluate Precision@3. //! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?; //! assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4); @@ -64,11 +55,6 @@ //! // Evaluate nDCG@3, where the metric is specified via a string representation. //! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "ndcg@3".parse()?)?; //! assert_abs_diff_eq!(evaluated.mean_score(), 0.4751, epsilon = 1e-4); -//! -//! // => precision@3: 0.5000 -//! // => ap: 0.5000 -//! // => rr: 0.6667 -//! // => ndcg@3: 0.4751 //! # Ok(()) //! # } //! 
``` From 31b8a757010a36bbde87a7ee0d0a952555f23281 Mon Sep 17 00:00:00 2001 From: kampersanda Date: Sun, 22 Sep 2024 00:49:59 +0900 Subject: [PATCH 6/6] add --- src/lib.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 16cb38e..dd4092f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -126,3 +126,39 @@ where let mean_score = scores.values().sum::() / scores.len() as f64; Ok(Evaluated { scores, mean_score }) } + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + + #[test] + fn test_evaluate() -> Result<(), errors::ElinorError> { + let mut b = GoldRelStoreBuilder::new(); + b.add_score("q_1", "d_1", 1)?; + b.add_score("q_1", "d_2", 0)?; + b.add_score("q_1", "d_3", 2)?; + b.add_score("q_2", "d_2", 2)?; + b.add_score("q_2", "d_4", 1)?; + let gold_rels = b.build(); + + let mut b = PredRelStoreBuilder::new(); + b.add_score("q_1", "d_1", 0.5.into())?; + b.add_score("q_1", "d_2", 0.4.into())?; + b.add_score("q_1", "d_3", 0.3.into())?; + b.add_score("q_2", "d_4", 0.1.into())?; + b.add_score("q_2", "d_1", 0.2.into())?; + b.add_score("q_2", "d_3", 0.3.into())?; + let pred_rels = b.build(); + + let evaluated = evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?; + assert_relative_eq!(evaluated.mean_score(), (2. / 3. + 1. / 3.) / 2.); + + let scores = evaluated.scores(); + assert_eq!(scores.len(), 2); + assert_relative_eq!(scores["q_1"], 2. / 3.); + assert_relative_eq!(scores["q_2"], 1. / 3.); + + Ok(()) + } +}
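A minimal end-to-end sketch of how downstream code can use the API after this patch series: `evaluate` now takes a single `Metric`, and `Evaluated` exposes `mean_score()` and `scores()` in place of the old `mean_scores`/`all_scores` maps. The data and metric list below simply mirror the README example from these patches; treat this as an illustration of the migration under that assumption, not additional library surface.

```rust
use elinor::{GoldRelStoreBuilder, Metric, PredRelStoreBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Gold (qrels-style) judgments, identical to the README example above.
    let mut b = GoldRelStoreBuilder::new();
    b.add_score("q_1", "d_1", 1)?;
    b.add_score("q_1", "d_2", 0)?;
    b.add_score("q_1", "d_3", 2)?;
    b.add_score("q_2", "d_2", 2)?;
    b.add_score("q_2", "d_4", 1)?;
    let gold_rels = b.build();

    // Predicted (run-style) scores.
    let mut b = PredRelStoreBuilder::new();
    b.add_score("q_1", "d_1", 0.5.into())?;
    b.add_score("q_1", "d_2", 0.4.into())?;
    b.add_score("q_1", "d_3", 0.3.into())?;
    b.add_score("q_2", "d_4", 0.1.into())?;
    b.add_score("q_2", "d_1", 0.2.into())?;
    b.add_score("q_2", "d_3", 0.3.into())?;
    let pred_rels = b.build();

    // With the reworked API, multi-metric evaluation becomes a loop over
    // single-metric calls.
    let metrics: Vec<Metric> = vec![
        Metric::Precision { k: 3 },
        Metric::AP { k: 0 }, // k = 0 means all documents.
        "rr".parse()?,
        "ndcg@3".parse()?,
    ];
    for metric in metrics {
        let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?;
        // Macro-averaged score over queries.
        let mean = evaluated.mean_score();
        println!("{metric}\t{mean:.4}");
        // Per-query scores remain available through the scores() accessor.
        for (query_id, score) in evaluated.scores() {
            println!("  {query_id}\t{score:.4}");
        }
    }
    Ok(())
}
```

This is essentially the pattern used by `elinor-evaluate/src/main.rs` and the updated examples after patches 4 and 5; the only assumption beyond the diffs is wiring it into a standalone `main`.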