diff --git a/docs/regressions/regressions-msmarco-passage-ca.md b/docs/regressions/regressions-msmarco-passage-ca.md index 5addf66318..38d841da66 100644 --- a/docs/regressions/regressions-msmarco-passage-ca.md +++ b/docs/regressions/regressions-msmarco-passage-ca.md @@ -21,11 +21,11 @@ Typical indexing command: ``` target/appassembler/bin/IndexCollection \ -collection JsonCollection \ - -input /path/to/msmarco-wp \ + -input /path/to/msmarco-passage \ -generator DefaultLuceneDocumentGenerator \ -index indexes/lucene-index.msmarco-passage-ca/ \ -threads 9 -storePositions -storeDocvectors -storeRaw -analyzeWithHuggingFaceTokenizer bert-base-uncased -useCompositeAnalyzer \ - >& logs/log.msmarco-wp & + >& logs/log.msmarco-passage & ``` The directory `/path/to/msmarco-passage-wp/` should be a directory containing the corpus in Anserini's jsonl format. @@ -44,17 +44,17 @@ target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-passage-ca/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt \ - -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased -useCompositeAnalyzer & + -output runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt \ + -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased -useCompositeAnalyzer & ``` Evaluation can be performed using `trec_eval`: ``` -tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt 
runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-msmarco-passage-hgf-wp.md b/docs/regressions/regressions-msmarco-passage-hgf-wp.md index 9565a46c67..7f862beb7d 100644 --- a/docs/regressions/regressions-msmarco-passage-hgf-wp.md +++ b/docs/regressions/regressions-msmarco-passage-hgf-wp.md @@ -23,11 +23,11 @@ Typical indexing command: ``` target/appassembler/bin/IndexCollection \ -collection JsonCollection \ - -input /path/to/msmarco-wp \ + -input /path/to/msmarco-passage \ -generator DefaultLuceneDocumentGenerator \ -index indexes/lucene-index.msmarco-passage-hgf-wp/ \ -threads 9 -storePositions -storeDocvectors -storeRaw -analyzeWithHuggingFaceTokenizer bert-base-uncased \ - >& logs/log.msmarco-wp & + >& logs/log.msmarco-passage & ``` The directory `/path/to/msmarco-passage-wp/` should be a directory containing the corpus in Anserini's jsonl format. 
@@ -46,17 +46,17 @@ target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-passage-hgf-wp/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt \ - -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased & + -output runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt \ + -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased & ``` Evaluation can be performed using `trec_eval`: ``` -tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-wp.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt 
runs/run.msmarco-passage.bm25-default.topics.msmarco-passage.dev-subset.txt ``` ## Effectiveness diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 727d5bd2ed..0ec74c16c5 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -235,7 +235,7 @@ public static class Args extends BaseSearchArgs { // ---------------------------------------------------------- @Option(name = "-impact", - forbids = {"-bm25", "-qld", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, + forbids = {"-bm25", "-bm25.accurate", "-qld", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, usage = "ranking model: BM25") public boolean impact = false; @@ -244,11 +244,13 @@ public static class Args extends BaseSearchArgs { // ------------------- @Option(name = "-bm25", - forbids = {"-impact", "-qld", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, + forbids = {"-impact", "-bm25.accurate", "-qld", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, usage = "ranking model: BM25") public boolean bm25 = false; - @Option(name = "-bm25.accurate", usage = "BM25: use accurate document lengths") + @Option(name = "-bm25.accurate", + forbids = {"-impact", "-bm25", "-qld", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, + usage = "BM25: use accurate document lengths") public boolean bm25Accurate = false; // BM25 parameters: Robertson et al. 
(TREC 4) propose the range of 1.0-2.0 for k1 and 0.6-0.75 for b, with k1 = 1.2 @@ -269,7 +271,7 @@ public static class Args extends BaseSearchArgs { // -------------------------------------------------------- @Option(name = "-qld", - forbids = {"-impact", "-bm25", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qljm", "-inl2", "-spl", "-f2exp", "-f2log"}, usage = "ranking model: query likelihood with Dirichlet smoothing") public boolean qld = false; @@ -289,7 +291,7 @@ public static class Args extends BaseSearchArgs { // ------------------------------------------------------------- @Option(name = "-qljm", - forbids = {"-impact", "-bm25", "-qld", "-inl2", "-spl", "-f2exp", "-f2log"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qld", "-inl2", "-spl", "-f2exp", "-f2log"}, usage = "ranking model: query likelihood with Jelinek-Mercer smoothing") public boolean qljm = false; @@ -301,7 +303,7 @@ public static class Args extends BaseSearchArgs { // ----------------------------------------- @Option(name = "-inl2", - forbids = {"-impact", "bm25", "-qld", "-qljm", "-spl", "-f2exp", "-f2log"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qld", "-qljm", "-spl", "-f2exp", "-f2log"}, usage = "use I(n)L2 scoring model") public boolean inl2 = false; @@ -309,7 +311,7 @@ public static class Args extends BaseSearchArgs { public String[] inl2_c = new String[]{"0.1"}; @Option(name = "-spl", - forbids = {"-impact", "bm25", "-qld", "-qljm", "-inl2", "-f2exp", "-f2log"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qld", "-qljm", "-inl2", "-f2exp", "-f2log"}, usage = "use SPL scoring model") public boolean spl = false; @@ -317,7 +319,7 @@ public static class Args extends BaseSearchArgs { public String[] spl_c = new String[]{"0.1"}; @Option(name = "-f2exp", - forbids = {"-impact", "bm25", "-qld", "-qljm", "-inl2", "-spl", "-f2log"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qld", "-qljm", "-inl2", "-spl", 
"-f2log"}, usage = "use F2Exp scoring model") public boolean f2exp = false; @@ -325,7 +327,7 @@ public static class Args extends BaseSearchArgs { public String[] f2exp_s = new String[]{"0.5"}; @Option(name = "-f2log", - forbids = {"-impact", "bm25", "-qld", "-qljm", "-inl2", "-spl", "-f2exp"}, + forbids = {"-impact", "-bm25", "-bm25.accurate", "-qld", "-qljm", "-inl2", "-spl", "-f2exp"}, usage = "use F2Log scoring model") public boolean f2log = false; @@ -503,6 +505,7 @@ public static class Args extends BaseSearchArgs { public Args impact() { this.impact = true; this.bm25 = false; + this.bm25Accurate = false; this.qld = false; this.qljm = false; this.inl2 = false; @@ -516,6 +519,21 @@ public Args impact() { public Args bm25() { this.impact = false; this.bm25 = true; + this.bm25Accurate = false; + this.qld = false; + this.qljm = false; + this.inl2 = false; + this.spl = false; + this.f2exp = false; + this.f2log = false; + + return this; + } + + public Args bm25Accurate() { + this.impact = false; + this.bm25 = false; + this.bm25Accurate = true; this.qld = false; this.qljm = false; this.inl2 = false; @@ -529,6 +547,7 @@ public Args bm25() { public Args qld() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = true; this.qljm = false; this.inl2 = false; @@ -542,6 +561,7 @@ public Args qld() { public Args qljm() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = false; this.qljm = true; this.inl2 = false; @@ -555,6 +575,7 @@ public Args qljm() { public Args inl2() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = false; this.qljm = false; this.inl2 = true; @@ -568,6 +589,7 @@ public Args inl2() { public Args spl() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = false; this.qljm = false; this.inl2 = false; @@ -581,6 +603,7 @@ public Args spl() { public Args f2exp() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = 
false; this.qljm = false; this.inl2 = false; @@ -594,6 +617,7 @@ public Args f2exp() { public Args f2log() { this.impact = false; this.bm25 = false; + this.bm25Accurate = false; this.qld = false; this.qljm = false; this.inl2 = false; diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py index b8cffe19a2..0905cd5465 100644 --- a/src/main/python/run_regression.py +++ b/src/main/python/run_regression.py @@ -137,8 +137,10 @@ def construct_indexing_command(yaml_data, args): return index_command -def construct_runfile_path(corpus, id, model_name): - return os.path.join('runs/', 'run.{0}.{1}.{2}'.format(corpus, id, model_name)) +def construct_runfile_path(index, id, model_name): + # if the index is 'indexes/lucene-index.msmarco-passage-ca/', we pull out 'msmarco-passage-ca' + index_part = index.split('/')[1].split('.')[1] + return os.path.join('runs/', 'run.{0}.{1}.{2}'.format(index_part, id, model_name)) def construct_search_commands(yaml_data): @@ -148,7 +150,7 @@ def construct_search_commands(yaml_data): '-index', construct_index_path(yaml_data), '-topics', os.path.join('tools/topics-and-qrels', topic_set['path']), '-topicReader', topic_set['topic_reader'] if 'topic_reader' in topic_set and topic_set['topic_reader'] else yaml_data['topic_reader'], - '-output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']), + '-output', construct_runfile_path(yaml_data['index_path'], topic_set['id'], model['name']), model['params'] ] for (model, topic_set) in list(itertools.product(yaml_data['models'], yaml_data['topics'])) @@ -162,8 +164,8 @@ def construct_convert_commands(yaml_data): conversion['command'], '--index', construct_index_path(yaml_data), '--topics', topic_set['id'], - '--input', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + conversion['in_file_ext'], - '--output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + conversion['out_file_ext'], + '--input', 
construct_runfile_path(yaml_data['index_path'], topic_set['id'], model['name']) + conversion['in_file_ext'], + '--output', construct_runfile_path(yaml_data['index_path'], topic_set['id'], model['name']) + conversion['out_file_ext'], conversion['params'] if 'params' in conversion and conversion['params'] else '', topic_set['convert_params'] if 'convert_params' in topic_set and topic_set['convert_params'] else '', ] @@ -186,7 +188,7 @@ def evaluate_and_verify(yaml_data, dry_run): eval_cmd = [ os.path.join(metric['command']), metric['params'] if 'params' in metric and metric['params'] else '', os.path.join('tools/topics-and-qrels', topic_set['qrel']) if 'qrel' in topic_set and topic_set['qrel'] else '', - construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + (yaml_data['conversions'][-1]['out_file_ext'] if 'conversions' in yaml_data and yaml_data['conversions'][-1]['out_file_ext'] else '') + construct_runfile_path(yaml_data['index_path'], topic_set['id'], model['name']) + (yaml_data['conversions'][-1]['out_file_ext'] if 'conversions' in yaml_data and yaml_data['conversions'][-1]['out_file_ext'] else '') ] if dry_run: logger.info(' '.join(eval_cmd)) diff --git a/src/main/resources/regression/msmarco-passage-ca.yaml b/src/main/resources/regression/msmarco-passage-ca.yaml index dce104ca46..23f890f1f9 100644 --- a/src/main/resources/regression/msmarco-passage-ca.yaml +++ b/src/main/resources/regression/msmarco-passage-ca.yaml @@ -1,5 +1,5 @@ --- -corpus: msmarco-wp +corpus: msmarco-passage corpus_path: collections/msmarco/passage index_path: indexes/lucene-index.msmarco-passage-ca/ @@ -52,7 +52,7 @@ topics: models: - name: bm25-default display: BM25 (default) - params: -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased -useCompositeAnalyzer + params: -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased -useCompositeAnalyzer results: AP@1000: - 0.1968 diff --git a/src/main/resources/regression/msmarco-passage-hgf-wp.yaml 
b/src/main/resources/regression/msmarco-passage-hgf-wp.yaml index d5781c8357..90b9a3fcbc 100644 --- a/src/main/resources/regression/msmarco-passage-hgf-wp.yaml +++ b/src/main/resources/regression/msmarco-passage-hgf-wp.yaml @@ -1,5 +1,5 @@ --- -corpus: msmarco-wp +corpus: msmarco-passage corpus_path: collections/msmarco/passage index_path: indexes/lucene-index.msmarco-passage-hgf-wp/ @@ -52,7 +52,7 @@ topics: models: - name: bm25-default display: BM25 (default) - params: -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased + params: -bm25 -analyzeWithHuggingFaceTokenizer bert-base-uncased results: AP@1000: - 0.1836 diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java index 53b4017e4f..c477d19ce0 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java @@ -18,6 +18,7 @@ import io.anserini.collection.TrecCollection; import io.anserini.index.IndexCollection; +import io.anserini.search.SearchCollection; import java.util.Arrays; import java.util.List; @@ -70,12 +71,9 @@ protected void setCheckIndexGroundTruth() { "text\n" + "")); - referenceDocTokens.put("TREC_DOC_1", Map.of( - "contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"}))); - referenceDocTokens.put("WSJ_1", Map.of( - "contents", List.of("head", "text", "01", "30", "03", "content"))); - referenceDocTokens.put("DOC222", Map.of( - "contents", List.of("head", "simpl", "enough", "text", "text", "text"))); + referenceDocTokens.put("TREC_DOC_1", Map.of("contents", Arrays.asList(null, null, "head", "veri", "simpl", "text"))); + referenceDocTokens.put("WSJ_1", Map.of("contents", List.of("head", "text", "01", "30", "03", "content"))); + referenceDocTokens.put("DOC222", Map.of("contents", List.of("head", "simpl", "enough", "text", "text", "text"))); fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 12; // 
Note that standard analyzer ignores stopwords; includes docids. @@ -97,6 +95,36 @@ protected void setSearchGroundTruth() { "1 Q0 TREC_DOC_1 2 0.333400 Anserini", "1 Q0 WSJ_1 3 0.068700 Anserini"}); + SearchCollection.Args argsRm3 = createDefaultSearchArgs().bm25(); + argsRm3.rm3 = true; + testQueries.put("bm25.rm3", argsRm3); + referenceRunOutput.put("bm25.rm3", new String[]{ + "1 Q0 DOC222 1 0.085800 Anserini", + "1 Q0 TREC_DOC_1 2 0.083400 Anserini", + "1 Q0 WSJ_1 3 0.017200 Anserini"}); + + SearchCollection.Args argsRocchio = createDefaultSearchArgs().bm25(); + argsRocchio.rocchio = true; + testQueries.put("bm25.rocchio", argsRocchio); + referenceRunOutput.put("bm25.rocchio", new String[]{ + "1 Q0 DOC222 1 0.242700 Anserini", + "1 Q0 TREC_DOC_1 2 0.235800 Anserini", + "1 Q0 WSJ_1 3 0.048500 Anserini"}); + + SearchCollection.Args argsBM25prf = createDefaultSearchArgs().bm25(); + argsBM25prf.bm25prf = true; + testQueries.put("bm25.bm25prf", argsBM25prf); + referenceRunOutput.put("bm25.bm25prf", new String[]{ + "1 Q0 DOC222 1 1.942500 Anserini", + "1 Q0 TREC_DOC_1 2 1.572300 Anserini", + "1 Q0 WSJ_1 3 1.200600 Anserini"}); + + testQueries.put("bm25Accurate", createDefaultSearchArgs().bm25Accurate()); + referenceRunOutput.put("bm25Accurate", new String[]{ + "1 Q0 DOC222 1 0.343200 Anserini", + "1 Q0 TREC_DOC_1 2 0.333400 Anserini", + "1 Q0 WSJ_1 3 0.068700 Anserini"}); + testQueries.put("qld", createDefaultSearchArgs().qld()); referenceRunOutput.put("qld", new String[]{ "1 Q0 DOC222 1 0.002500 Anserini", diff --git a/src/test/java/io/anserini/search/SearchCollectionTest.java b/src/test/java/io/anserini/search/SearchCollectionTest.java index 6bf52e9c4c..50da58618e 100644 --- a/src/test/java/io/anserini/search/SearchCollectionTest.java +++ b/src/test/java/io/anserini/search/SearchCollectionTest.java @@ -83,6 +83,11 @@ public void testMutallyExclusive() throws Exception { "-bm25", "-qld"}); assertTrue(err.toString().contains("cannot be used with the option")); + 
err.reset(); + SearchCollection.main(new String[] {"-index", "foo", "-output", "bar", "-topicReader", "baz", "-topics", "topic", + "-bm25", "-bm25.accurate"}); + assertTrue(err.toString().contains("cannot be used with the option")); + err.reset(); SearchCollection.main(new String[] {"-index", "foo", "-output", "bar", "-topicReader", "baz", "-topics", "topic", "-bm25", "-qljm"}); @@ -106,22 +111,58 @@ public void testMutallyExclusive() throws Exception { restoreStderr(); } + @Test + public void testInvalidTopicReader() throws Exception { + redirectStderr(); + + err.reset(); + SearchCollection.main(new String[] { + "-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2/", + "-topics", "src/test/resources/sample_topics/Trec", + "-topicReader", "FakeTrec", + "-output", "run.test", "-bm25"}); + assertTrue(err.toString().contains("Unable to load topic reader")); + + restoreStderr(); + } + + @Test + public void testInvalidFields() throws Exception { + redirectStderr(); + + err.reset(); + SearchCollection.main(new String[] { + "-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2/", + "-topics", "src/test/resources/sample_topics/Trec", + "-topicReader", "Trec", + "-fields", "field1=a", + "-output", "run.test", "-bm25"}); + assertTrue(err.toString().contains("Error parsing -fields")); + + restoreStderr(); + } + @Test public void testSearchLucene9() throws Exception { - SearchCollection.main( - new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2/", - "-topics", "src/test/resources/sample_topics/Trec", - "-topicReader", "Trec", "-output", "run.test", "-bm25"}); + SearchCollection.main(new String[] { + "-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_collection2/", + "-topics", "src/test/resources/sample_topics/Trec", + "-topicReader", "Trec", + "-output", "run.test", "-bm25"}); + TestUtils.checkFile("run.test", new String[]{ 
"1 Q0 DOC222 1 0.343200 Anserini", "1 Q0 TREC_DOC_1 2 0.333400 Anserini", "1 Q0 WSJ_1 3 0.068700 Anserini"}); new File("run.test").delete(); - SearchCollection.main( - new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/", - "-topics", "src/test/resources/sample_topics/json_topics1.tsv", - "-topicReader", "TsvInt", "-output", "run.test", "-pretokenized", "-impact"}); + SearchCollection.main(new String[] { + "-index", "src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/", + "-topics", "src/test/resources/sample_topics/json_topics1.tsv", + "-topicReader", "TsvInt", + "-output", "run.test", + "-pretokenized", "-impact"}); + TestUtils.checkFile("run.test", new String[]{ "1 Q0 2000001 1 4.000000 Anserini",}); new File("run.test").delete(); @@ -129,20 +170,24 @@ public void testSearchLucene9() throws Exception { @Test public void testSearchLucene8() throws Exception { - SearchCollection.main( - new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_collection2/", - "-topics", "src/test/resources/sample_topics/Trec", - "-topicReader", "Trec", "-output", "run.test", "-bm25"}); + SearchCollection.main(new String[] { + "-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_collection2/", + "-topics", "src/test/resources/sample_topics/Trec", + "-topicReader", "Trec", + "-output", "run.test", "-bm25"}); + TestUtils.checkFile("run.test", new String[]{ "1 Q0 DOC222 1 0.343192 Anserini", "1 Q0 TREC_DOC_1 2 0.333445 Anserini", "1 Q0 WSJ_1 3 0.068654 Anserini"}); new File("run.test").delete(); - SearchCollection.main( - new String[] {"-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/", - "-topics", "src/test/resources/sample_topics/json_topics1.tsv", - "-topicReader", "TsvInt", "-output", "run.test", "-pretokenized", "-impact"}); + SearchCollection.main(new String[] { + 
"-index", "src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/", + "-topics", "src/test/resources/sample_topics/json_topics1.tsv", + "-topicReader", "TsvInt", "-output", + "run.test", "-pretokenized", "-impact"}); + TestUtils.checkFile("run.test", new String[]{ "1 Q0 2000001 1 4.000000 Anserini",}); new File("run.test").delete();