From 8aea08ed65086ccb773991d9ef8824861d1b34df Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 15 Jan 2025 22:12:28 -0500 Subject: [PATCH 1/2] Add bindings to Snowflake indexes. --- .../java/io/anserini/index/IndexInfo.java | 100 ++++++++++++++++++ .../io/anserini/index/PrebuiltIndexTest.java | 2 +- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/anserini/index/IndexInfo.java b/src/main/java/io/anserini/index/IndexInfo.java index efa8e511d1..1ce13b79a5 100644 --- a/src/main/java/io/anserini/index/IndexInfo.java +++ b/src/main/java/io/anserini/index/IndexInfo.java @@ -159,6 +159,106 @@ public enum IndexInfo { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v2.1-doc-segmented.20240418.4f9675.tar.gz" }, "6ec4cd595c9fe1ad91b43eabb39a637c"), + MSMARCO_V21_DOC_SEGMENTED_SHARD00_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard00) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.20250114.4884f5.tar.gz" }, + "aab3f8e9aa0563bd0f875584784a0845"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD01_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard01) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.20250114.4884f5.tar.gz" }, + "34ea30fe72c2bc1795ae83e71b191547"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD02_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard02) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.20250114.4884f5.tar.gz" }, + "b6271d6db65119977491675f74f466d5"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD03_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard03) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.20250114.4884f5.tar.gz" }, + "a9cd644eb6037f67d2e9c06a8f60928d"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD04_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard04) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.20250114.4884f5.tar.gz" }, + "07b7e451e0525d01c1f1f2b1c42b1bd5"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD05_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard05) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.20250114.4884f5.tar.gz" }, + "2573dce175788981be2f266ebb33c96d"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD06_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard06) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.20250114.4884f5.tar.gz" }, + "a644aea445a8b78cc9e99d2ce111ff11"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD07_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard07) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.20250114.4884f5.tar.gz" }, + "402d37deccb44b5fc105049889e8aaea"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD08_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard08) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.20250114.4884f5.tar.gz" }, + "89ebcd027f7297b26a1edc8ae5726527"), + + MSMARCO_V21_DOC_SEGMENTED_SHARD09_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.hnsw-int8", + "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard09) encoded by Snowflake's arctic-embed-l model.", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.20250114.4884f5.tar.gz", + "", + "MS MARCO V2.1 Segmented Doc", + "Snowflake's arctic-embed-l w/ HNSW int8", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.20250114.4884f5.tar.gz" }, + "5e580bb7eb9ee2bb6bfa492b3430c17d"), + // BEIR: flat BEIR_V1_0_0_TREC_COVID_FLAT("beir-v1.0.0-trec-covid.flat", "Lucene inverted 'flat' index of BEIR collection 'trec-covid'.", diff --git a/src/test/java/io/anserini/index/PrebuiltIndexTest.java b/src/test/java/io/anserini/index/PrebuiltIndexTest.java index 6658abb23e..a89c7c906b 100644 --- a/src/test/java/io/anserini/index/PrebuiltIndexTest.java +++ b/src/test/java/io/anserini/index/PrebuiltIndexTest.java @@ -60,6 +60,6 @@ public void testUrls() { // test number of prebuilt-indexes @Test public void testNumPrebuiltIndexes() { - assertEquals(159, IndexInfo.values().length); + assertEquals(169, IndexInfo.values().length); } } From 1ec424f1a675367fed0524a2615f617208c89052 Mon Sep 17 00:00:00 2001 From: lintool Date: Thu, 16 Jan 2025 13:32:30 -0500 Subject: [PATCH 2/2] tweaks to docs. --- .../java/io/anserini/index/IndexInfo.java | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexInfo.java b/src/main/java/io/anserini/index/IndexInfo.java index 1ce13b79a5..0b2506fc47 100644 --- a/src/main/java/io/anserini/index/IndexInfo.java +++ b/src/main/java/io/anserini/index/IndexInfo.java @@ -162,8 +162,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD00_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard00) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard00)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard00.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -172,8 +172,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD01_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard01) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard01)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard01.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -182,8 +182,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD02_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard02) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard02)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard02.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -192,8 +192,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD03_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard03) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard03)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard03.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -202,8 +202,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD04_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard04) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard04)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard04.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -212,8 +212,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD05_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard05) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard05)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard05.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -222,8 +222,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD06_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard06) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard06)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard06.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -232,8 +232,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD07_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard07) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard07)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard07.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -242,8 +242,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD08_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard08) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard08)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard08.arctic-embed-l.20250114.4884f5.tar.gz" }, @@ -252,8 +252,8 @@ public enum IndexInfo { MSMARCO_V21_DOC_SEGMENTED_SHARD09_ARCTIC_EMBED_L_HNSW_INT8("msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.hnsw-int8", "Lucene quantized (int8) HNSW index of the MS MARCO V2.1 segmented document corpus (shard09) encoded by Snowflake's arctic-embed-l model.", "lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.20250114.4884f5.tar.gz", - "", - "MS MARCO V2.1 Segmented Doc", + "lucene-hnsw-int8.msmarco-v2.1-doc-segmented.arctic-embed-l.20250114.4884f5.README.md", + "MS MARCO V2.1 Segmented Doc (shard09)", "Snowflake's arctic-embed-l w/ HNSW int8", new String[] { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw-int8.msmarco-v2.1-doc-segmented-shard09.arctic-embed-l.20250114.4884f5.tar.gz" },