diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22ea933..e24678c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,7 +57,6 @@ jobs:
cargo run --release --example from_json --features serde
cargo run --release --example from_trec
cargo run --release --example paired_bootstrap_test
- cargo run --release --example simple
correctness-test:
name: Correctness test against trec_eval
diff --git a/README.md b/README.md
index bf24961..be5d830 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Elinor: Evaluation Library in Information Retrieval
+# Elinor: Evaluation Library in INfOrmation Retrieval
@@ -14,8 +14,7 @@ inspired by [ranx](https://github.com/AmenRa/ranx) and [Sakai's book](https://ww
## Features
- **IRer-friendly**:
- The library is designed to be easy to use for developers in information retrieval
- by providing TREC-like data structures, such as Qrels and Run.
+ The library is designed to be easy to use for developers in information retrieval.
- **Flexible**:
The library supports various evaluation metrics, such as Precision, MAP, MRR, and nDCG.
The supported metrics are available in [Metric](https://docs.rs/elinor/latest/elinor/metrics/enum.Metric.html).
@@ -33,52 +32,47 @@ RUSTDOCFLAGS="--html-in-header katex.html" cargo doc --no-deps --open
## Getting Started
-A simple routine to prepare Qrels and Run data structures
+A simple routine to prepare gold and predicted relevance scores
and evaluate them using Precision@3, MAP, MRR, and nDCG@3:
```rust
-use elinor::{QrelsBuilder, RunBuilder, Metric};
-
-// Construct Qrels data structure.
-let mut qb = QrelsBuilder::new();
-qb.add_score("q_1", "d_1", 1)?;
-qb.add_score("q_1", "d_2", 0)?;
-qb.add_score("q_1", "d_3", 2)?;
-qb.add_score("q_2", "d_2", 2)?;
-qb.add_score("q_2", "d_4", 1)?;
-let qrels = qb.build();
-
-// Construct Run data structure.
-let mut rb = RunBuilder::new();
-rb.add_score("q_1", "d_1", 0.5.into())?;
-rb.add_score("q_1", "d_2", 0.4.into())?;
-rb.add_score("q_1", "d_3", 0.3.into())?;
-rb.add_score("q_2", "d_4", 0.1.into())?;
-rb.add_score("q_2", "d_1", 0.2.into())?;
-rb.add_score("q_2", "d_3", 0.3.into())?;
-let run = rb.build();
-
-// The metrics to evaluate can be specified via Metric instances.
-let metrics = vec![
- Metric::Precision { k: 3 },
- Metric::AP { k: 0 }, // k=0 means all documents.
- // The instances can also be specified via strings.
- "rr".parse()?,
- "ndcg@3".parse()?,
-];
-
-// Evaluate the qrels and run data.
-let evaluated = elinor::evaluate(&qrels, &run, metrics.iter().cloned())?;
-
-// Macro-averaged scores.
-for metric in &metrics {
- let score = evaluated.mean_scores[metric];
- println!("{metric}: {score:.4}");
-}
-// => precision@3: 0.5000
-// => ap: 0.5000
-// => rr: 0.6667
-// => ndcg@3: 0.4751
+use elinor::{GoldRelStoreBuilder, PredRelStoreBuilder, Metric};
+use approx::assert_abs_diff_eq;
+
+// Prepare gold relevance scores.
+let mut b = GoldRelStoreBuilder::new();
+b.add_score("q_1", "d_1", 1)?;
+b.add_score("q_1", "d_2", 0)?;
+b.add_score("q_1", "d_3", 2)?;
+b.add_score("q_2", "d_2", 2)?;
+b.add_score("q_2", "d_4", 1)?;
+let gold_rels = b.build();
+
+// Prepare predicted relevance scores.
+let mut b = PredRelStoreBuilder::new();
+b.add_score("q_1", "d_1", 0.5.into())?;
+b.add_score("q_1", "d_2", 0.4.into())?;
+b.add_score("q_1", "d_3", 0.3.into())?;
+b.add_score("q_2", "d_4", 0.1.into())?;
+b.add_score("q_2", "d_1", 0.2.into())?;
+b.add_score("q_2", "d_3", 0.3.into())?;
+let pred_rels = b.build();
+
+// Evaluate Precision@3.
+let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?;
+assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4);
+
+// Evaluate MAP, where all documents are considered via k=0.
+let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::AP { k: 0 })?;
+assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4);
+
+// Evaluate MRR, where the metric is specified via a string representation.
+let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "rr".parse()?)?;
+assert_abs_diff_eq!(evaluated.mean_score(), 0.6667, epsilon = 1e-4);
+
+// Evaluate nDCG@3, where the metric is specified via a string representation.
+let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "ndcg@3".parse()?)?;
+assert_abs_diff_eq!(evaluated.mean_score(), 0.4751, epsilon = 1e-4);
```
Other examples are available in the [`examples`](https://github.com/kampersanda/elinor/tree/main/examples) directory.
diff --git a/elinor-evaluate/src/main.rs b/elinor-evaluate/src/main.rs
index 0f42d69..8447c29 100644
--- a/elinor-evaluate/src/main.rs
+++ b/elinor-evaluate/src/main.rs
@@ -28,10 +28,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let pred_rels = trec::parse_pred_rels_in_trec(load_lines(&args.pred_file)?.into_iter())?;
let metrics = all_metrics(&args.ks);
- let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?;
-
- for metric in &metrics {
- let score = evaluated.mean_scores[metric];
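+ // Evaluate each metric in turn and print its macro-averaged score.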
+ for metric in metrics {
+ let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?;
+ let score = evaluated.mean_score();
println!("{metric}\t{score:.4}");
}
diff --git a/examples/from_json.rs b/examples/from_json.rs
index 6c85a5d..53af2da 100644
--- a/examples/from_json.rs
+++ b/examples/from_json.rs
@@ -61,22 +61,10 @@ fn main() -> Result<()> {
Metric::NDCG { k: 3 },
Metric::NDCGBurges { k: 3 },
];
- let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?;
- println!("=== Mean scores ===");
- for metric in &metrics {
- let score = evaluated.mean_scores[metric];
- println!("{metric}: {score:.4}");
- }
-
- println!("\n=== Scores for each query ===");
- for metric in &metrics {
- println!("{metric}");
- let qid_to_score = &evaluated.all_scores[metric];
- for qid in ["q_1", "q_2"] {
- let score = qid_to_score[qid];
- println!("- {qid}: {score:.4}");
- }
+ for metric in metrics {
+ let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?;
+ println!("{:?}: {:.4}", metric, evaluated.mean_score());
}
Ok(())
diff --git a/examples/from_trec.rs b/examples/from_trec.rs
index 8dca1d4..ba595c7 100644
--- a/examples/from_trec.rs
+++ b/examples/from_trec.rs
@@ -37,22 +37,10 @@ q_2 0 d_4 3 0.1 SAMPLE
Metric::NDCG { k: 3 },
Metric::NDCGBurges { k: 3 },
];
- let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?;
- println!("=== Mean scores ===");
- for metric in &metrics {
- let score = evaluated.mean_scores[metric];
- println!("{metric}: {score:.4}");
- }
-
- println!("\n=== Scores for each query ===");
- for metric in &metrics {
- println!("{metric}");
- let qid_to_score = &evaluated.all_scores[metric];
- for qid in ["q_1", "q_2"] {
- let score = qid_to_score[qid];
- println!("- {qid}: {score:.4}");
- }
+ for metric in metrics {
+ let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metric)?;
+ println!("{:?}: {:.4}", metric, evaluated.mean_score());
}
Ok(())
diff --git a/examples/simple.rs b/examples/simple.rs
deleted file mode 100644
index abbdd3a..0000000
--- a/examples/simple.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-use anyhow::Result;
-use elinor::GoldRelStoreBuilder;
-use elinor::Metric;
-use elinor::PredRelStoreBuilder;
-
-fn main() -> Result<()> {
- // Prepare gold relevance scores.
- let mut b = GoldRelStoreBuilder::new();
- b.add_score("q_1", "d_1", 1)?;
- b.add_score("q_1", "d_2", 0)?;
- b.add_score("q_1", "d_3", 2)?;
- b.add_score("q_2", "d_2", 2)?;
- b.add_score("q_2", "d_4", 1)?;
- let gold_rels = b.build();
-
- // Prepare predicted relevance scores.
- let mut b = PredRelStoreBuilder::new();
- b.add_score("q_1", "d_1", 0.5.into())?;
- b.add_score("q_1", "d_2", 0.4.into())?;
- b.add_score("q_1", "d_3", 0.3.into())?;
- b.add_score("q_2", "d_4", 0.1.into())?;
- b.add_score("q_2", "d_1", 0.2.into())?;
- b.add_score("q_2", "d_3", 0.3.into())?;
- let pred_rels = b.build();
-
- // The metrics to evaluate can be specified via Metric instances.
- let metrics = vec![
- Metric::Precision { k: 3 },
- Metric::AP { k: 0 }, // k=0 means all documents.
- // The instances can also be specified via strings.
- "rr".parse()?,
- "ndcg@3".parse()?,
- ];
-
- // Evaluate.
- let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?;
-
- // Macro-averaged scores.
- for metric in &metrics {
- let score = evaluated.mean_scores[metric];
- println!("{metric}: {score:.4}");
- }
-
- Ok(())
-}
diff --git a/src/lib.rs b/src/lib.rs
index 4d74ebe..dd4092f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,6 +19,7 @@
//! ```
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! use elinor::{GoldRelStoreBuilder, PredRelStoreBuilder, Metric};
+//! use approx::assert_abs_diff_eq;
//!
//! // Prepare gold relevance scores.
//! let mut b = GoldRelStoreBuilder::new();
@@ -39,27 +40,21 @@
//! b.add_score("q_2", "d_3", 0.3.into())?;
//! let pred_rels = b.build();
//!
-//! // The metrics to evaluate can be specified via Metric instances.
-//! let metrics = vec![
-//! Metric::Precision { k: 3 },
-//! Metric::AP { k: 0 }, // k=0 means all documents.
-//! // The instances can also be specified via strings.
-//! "rr".parse()?,
-//! "ndcg@3".parse()?,
-//! ];
+//! // Evaluate Precision@3.
+//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?;
+//! assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4);
//!
-//! // Evaluate.
-//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, metrics.iter().cloned())?;
+//! // Evaluate MAP, where all documents are considered via k=0.
+//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, Metric::AP { k: 0 })?;
+//! assert_abs_diff_eq!(evaluated.mean_score(), 0.5000, epsilon = 1e-4);
//!
-//! // Macro-averaged scores.
-//! for metric in &metrics {
-//! let score = evaluated.mean_scores[metric];
-//! println!("{metric}: {score:.4}");
-//! }
-//! // => precision@3: 0.5000
-//! // => ap: 0.5000
-//! // => rr: 0.6667
-//! // => ndcg@3: 0.4751
+//! // Evaluate MRR, where the metric is specified via a string representation.
+//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "rr".parse()?)?;
+//! assert_abs_diff_eq!(evaluated.mean_score(), 0.6667, epsilon = 1e-4);
+//!
+//! // Evaluate nDCG@3, where the metric is specified via a string representation.
+//! let evaluated = elinor::evaluate(&gold_rels, &pred_rels, "ndcg@3".parse()?)?;
+//! assert_abs_diff_eq!(evaluated.mean_score(), 0.4751, epsilon = 1e-4);
//! # Ok(())
//! # }
//! ```
@@ -73,9 +68,9 @@ pub mod relevance;
pub mod statistical_tests;
pub mod trec;
-use ordered_float::OrderedFloat;
use std::collections::HashMap;
-use std::collections::HashSet;
+
+use ordered_float::OrderedFloat;
pub use metrics::Metric;
pub use relevance::Relevance;
@@ -102,34 +97,68 @@ pub type PredRelStoreBuilder<K> = relevance::RelevanceStoreBuilder<K, PredScore>;
/// Data type to store evaluated scores.
pub struct Evaluated<K> {
- /// Metric to macro-averaged score.
- pub mean_scores: HashMap<Metric, f64>,
+ scores: HashMap<K, f64>,
+ mean_score: f64,
+}
+
+impl<K> Evaluated<K> {
+ /// Returns a reference to the mapping from query IDs to scores.
+ pub const fn scores(&self) -> &HashMap<K, f64> {
+ &self.scores
+ }
- /// Metric to mapping from query ID to the score.
- pub all_scores: HashMap<Metric, HashMap<K, f64>>,
+ /// Returns the macro-averaged score.
+ pub const fn mean_score(&self) -> f64 {
+ self.mean_score
+ }
}
/// Evaluates the given gold_rels and pred_rels data using the specified metrics.
-pub fn evaluate<K, M>(
+pub fn evaluate<K>(
gold_rels: &GoldRelStore<K>,
pred_rels: &PredRelStore<K>,
- metrics: M,
+ metric: Metric,
) -> Result<Evaluated<K>, errors::ElinorError>
where
K: Clone + Eq + Ord + std::hash::Hash + std::fmt::Display,
- M: IntoIterator<Item = Metric>,
{
- let metrics: HashSet<Metric> = metrics.into_iter().collect();
- let mut mean_scores = HashMap::new();
- let mut all_scores = HashMap::new();
- for metric in metrics {
- let result = metrics::compute_metric(gold_rels, pred_rels, metric)?;
- let mean_score = result.values().sum::<f64>() / result.len() as f64;
- mean_scores.insert(metric, mean_score);
- all_scores.insert(metric, result);
+ let scores = metrics::compute_metric(gold_rels, pred_rels, metric)?;
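+ // Macro-average the per-query scores to obtain a single summary value.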
+ let mean_score = scores.values().sum::<f64>() / scores.len() as f64;
+ Ok(Evaluated { scores, mean_score })
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use approx::assert_relative_eq;
+
+ #[test]
+ fn test_evaluate() -> Result<(), errors::ElinorError> {
+ let mut b = GoldRelStoreBuilder::new();
+ b.add_score("q_1", "d_1", 1)?;
+ b.add_score("q_1", "d_2", 0)?;
+ b.add_score("q_1", "d_3", 2)?;
+ b.add_score("q_2", "d_2", 2)?;
+ b.add_score("q_2", "d_4", 1)?;
+ let gold_rels = b.build();
+
+ let mut b = PredRelStoreBuilder::new();
+ b.add_score("q_1", "d_1", 0.5.into())?;
+ b.add_score("q_1", "d_2", 0.4.into())?;
+ b.add_score("q_1", "d_3", 0.3.into())?;
+ b.add_score("q_2", "d_4", 0.1.into())?;
+ b.add_score("q_2", "d_1", 0.2.into())?;
+ b.add_score("q_2", "d_3", 0.3.into())?;
+ let pred_rels = b.build();
+
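+ // Precision@3: q_1 has two relevant documents (d_1, d_3) in its top 3, and q_2 has one (d_4).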
+ let evaluated = evaluate(&gold_rels, &pred_rels, Metric::Precision { k: 3 })?;
+ assert_relative_eq!(evaluated.mean_score(), (2. / 3. + 1. / 3.) / 2.);
+
+ let scores = evaluated.scores();
+ assert_eq!(scores.len(), 2);
+ assert_relative_eq!(scores["q_1"], 2. / 3.);
+ assert_relative_eq!(scores["q_2"], 1. / 3.);
+
+ Ok(())
}
- Ok(Evaluated {
- mean_scores,
- all_scores,
- })
}
diff --git a/src/metrics.rs b/src/metrics.rs
index 3f6decd..beb61e3 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -8,6 +8,7 @@ pub(crate) mod precision;
pub(crate) mod r_precision;
pub(crate) mod recall;
pub(crate) mod reciprocal_rank;
+pub(crate) mod success;
use std::collections::HashMap;
use std::fmt::Display;
@@ -336,7 +337,9 @@ where
let golds = gold_rels.get_map(query_id).unwrap();
let score = match metric {
Metric::Hits { k } => hits::compute_hits(golds, sorted_preds, k, RELEVANT_LEVEL),
- Metric::Success { k } => hits::compute_success(golds, sorted_preds, k, RELEVANT_LEVEL),
+ Metric::Success { k } => {
+ success::compute_success(golds, sorted_preds, k, RELEVANT_LEVEL)
+ }
Metric::Precision { k } => {
precision::compute_precision(golds, sorted_preds, k, RELEVANT_LEVEL)
}
diff --git a/src/metrics/hits.rs b/src/metrics/hits.rs
index d25c7e8..b443274 100644
--- a/src/metrics/hits.rs
+++ b/src/metrics/hits.rs
@@ -25,24 +25,3 @@ where
}
hits as f64
}
-
-/// Returns 1 if at least one relevant document is found, 0 otherwise.
-pub fn compute_success<K>(
- golds: &HashMap<K, GoldScore>,
- sorted_preds: &[Relevance<K, PredScore>],
- k: usize,
- rel_lvl: GoldScore,
-) -> f64
-where
- K: Eq + std::hash::Hash,
-{
- let k = if k == 0 { sorted_preds.len() } else { k };
- for pred in sorted_preds.iter().take(k) {
- if let Some(&rel) = golds.get(&pred.doc_id) {
- if rel >= rel_lvl {
- return 1.0;
- }
- }
- }
- 0.0
-}
diff --git a/src/metrics/success.rs b/src/metrics/success.rs
new file mode 100644
index 0000000..6c23663
--- /dev/null
+++ b/src/metrics/success.rs
@@ -0,0 +1,26 @@
+use std::collections::HashMap;
+
+use crate::GoldScore;
+use crate::PredScore;
+use crate::Relevance;
+
+/// Returns 1 if at least one relevant document is found, 0 otherwise.
+pub fn compute_success<K>(
+ golds: &HashMap<K, GoldScore>,
+ sorted_preds: &[Relevance<K, PredScore>],
+ k: usize,
+ rel_lvl: GoldScore,
+) -> f64
+where
+ K: Eq + std::hash::Hash,
+{
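+ // A cutoff of k == 0 means that all ranked documents are considered.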
+ let k = if k == 0 { sorted_preds.len() } else { k };
+ for pred in sorted_preds.iter().take(k) {
+ if let Some(&rel) = golds.get(&pred.doc_id) {
+ if rel >= rel_lvl {
+ return 1.0;
+ }
+ }
+ }
+ 0.0
+}