castorini
diff --git a/‎pom.xml
Lines changed: 10 additions & 0 deletions b/‎pom.xml
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/main/java/io/anserini/index/IndexInfo.java
Lines changed: 102 additions & 0 deletions b/‎src/main/java/io/anserini/index/IndexInfo.java
Lines changed: 102 additions & 0 deletions
diff --git a/‎src/main/java/io/anserini/search/SearchCollection.java
Lines changed: 19 additions & 2 deletions b/‎src/main/java/io/anserini/search/SearchCollection.java
Lines changed: 19 additions & 2 deletions
diff --git a/‎src/main/java/io/anserini/util/PrebuiltIndexHandler.java
Lines changed: 206 additions & 0 deletions b/‎src/main/java/io/anserini/util/PrebuiltIndexHandler.java
Lines changed: 206 additions & 0 deletions
@@ -463,5 +463,15 @@
       <artifactId>api</artifactId>
       <version>0.21.0</version>
     </dependency>
+    <!-- Console progress bar shown by PrebuiltIndexHandler while an index is downloading. -->
+    <dependency>
+      <groupId>me.tongfei</groupId>
+      <artifactId>progressbar</artifactId>
+      <version>0.10.0</version>
+    </dependency>
+    <!-- DigestUtils, used by PrebuiltIndexHandler to compute the MD5 of a downloaded index. -->
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>1.15</version>
+    </dependency>
   </dependencies>
 </project>
@@ -0,0 +1,102 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index;
+
+/**
+ * Registry of prebuilt indexes that can be referred to by a short name.
+ *
+ * <p>Each constant carries the metadata needed to fetch and verify one index: the archive
+ * filename, the download URLs and the expected MD5 checksum, plus descriptive statistics that
+ * are stored verbatim as strings.
+ */
+public enum IndexInfo {
+  MSMARCO_V1_PASSAGE("msmarco-v1-passage",
+      "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
+      "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
+      "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
+      new String[] {
+          "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz" },
+      "c697b18c9a0686ca760583e615dbe450", "2170758938", "352316036", "8841823",
+      "2660824", false),
+
+  CACM("cacm",
+      "Lucene index of the CACM corpus. (Lucene 9)",
+      "lucene-index.cacm.tar.gz",
+      new String[] {
+          "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz" },
+      "cfe14d543c6a27f4d742fb2d0099b8e0",
+      "2347197",
+      "320968",
+      "3204",
+      "14363");
+
+  /** Short name used to look the index up, e.g. {@code "cacm"}. */
+  public final String indexName;
+  /** Human-readable description of the index. */
+  public final String description;
+  /** Filename of the downloadable archive. */
+  public final String filename;
+  /** Filename of the accompanying README, or the empty string when none is declared. */
+  public final String readme;
+  /** Download locations of the archive. */
+  public final String[] urls;
+  /** Expected MD5 checksum of the archive, as a hex string. */
+  public final String md5;
+  /** Size of the index as declared in the registry (stored as a string). */
+  public final String size;
+  /** Total term count as declared in the registry (stored as a string). */
+  public final String totalTerms;
+  /** Total document count as declared in the registry (stored as a string). */
+  public final String totalDocs;
+  /** Unique term count as declared in the registry (stored as a string). */
+  public final String totalUniqueTerms;
+  /** Flag supplied at declaration time; {@code false} when not given explicitly. */
+  public final boolean downloaded;
+
+  /**
+   * Full constructor taking all 11 fields.
+   */
+  IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
+      String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
+    this.indexName = indexName;
+    this.description = description;
+    this.filename = filename;
+    this.readme = readme;
+    this.urls = urls;
+    this.md5 = md5;
+    this.size = size;
+    this.totalTerms = totalTerms;
+    this.totalDocs = totalDocs;
+    this.totalUniqueTerms = totalUniqueTerms;
+    this.downloaded = downloaded;
+  }
+
+  /**
+   * Convenience constructor taking 9 fields; {@code readme} defaults to the empty string and
+   * {@code downloaded} to {@code false}.
+   */
+  IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
+      String totalTerms, String totalDocs, String totalUniqueTerms) {
+    // Delegate so that the field assignments live in exactly one place.
+    this(indexName, description, filename, "", urls, md5, size, totalTerms, totalDocs, totalUniqueTerms, false);
+  }
+
+  /**
+   * Finds the constant registered under the given name.
+   *
+   * @param indexName name to look up; may be {@code null}
+   * @return the matching constant, or {@code null} when no constant has that name
+   */
+  private static IndexInfo find(String indexName) {
+    for (IndexInfo candidate : values()) {
+      if (candidate.indexName.equals(indexName)) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Tells whether an index with the given name is registered.
+   *
+   * @param indexName name to look up
+   * @return {@code true} if some constant's {@link #indexName} equals {@code indexName}
+   */
+  public static boolean contains(String indexName) {
+    return find(indexName) != null;
+  }
+
+  /**
+   * Returns the index registered under the given name.
+   *
+   * @param indexName name to look up
+   * @return the matching constant
+   * @throws IllegalArgumentException if no constant has that name
+   */
+  public static IndexInfo get(String indexName) {
+    IndexInfo match = find(indexName);
+    if (match == null) {
+      throw new IllegalArgumentException("Index name " + indexName + " not found!");
+    }
+    return match;
+  }
+
+}
@@ -44,6 +44,8 @@
 import io.anserini.search.topicreader.BackgroundLinkingTopicReader;
 import io.anserini.search.topicreader.TopicReader;
 import io.anserini.search.topicreader.Topics;
+import io.anserini.util.PrebuiltIndexHandler;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.time.DurationFormatUtils;
@@ -955,12 +957,27 @@ public void run() {
 
   public SearchCollection(Args args) throws IOException {
     this.args = args;
-    Path indexPath = Paths.get(args.index);
+    Path indexPath = Path.of(args.index);
+    PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
+    if (!Files.exists(indexPath)) {
+      // it doesn't exist locally, we try to download it from remote
+      try {
+        indexHandler.initialize();
+        indexHandler.download();
+        indexPath = Path.of(indexHandler.decompressIndex());
+      } catch (IOException e) {
+        throw new RuntimeException("MD5 checksum does not match!");
+      } catch (Exception e) {
+        throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
+      }
+    } else {
+      // if it exists locally, we use it
+      indexPath = Paths.get(args.index);
+    }
 
     if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
       throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
     }
-
     LOG.info("============ Initializing Searcher ============");
     LOG.info("Index: " + indexPath);
     this.reader = args.inmem ? DirectoryReader.open(MMapDirectory.open(indexPath)) :
 
@@ -0,0 +1,206 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.util;
+
+import me.tongfei.progressbar.ProgressBar;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CountingInputStream;
+
+import io.anserini.index.IndexInfo;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Downloads, verifies and unpacks a prebuilt index registered in {@link IndexInfo}.
+ *
+ * <p>Call {@link #initialize()} first, then {@link #download()}, then
+ * {@link #decompressIndex()}. Archives are cached under {@code ~/.cache/pyserini/indexes} when
+ * that directory already exists, otherwise under {@code ~/.cache/anserini/indexes}.
+ */
+public class PrebuiltIndexHandler {
+  private static final String GZIP_SUFFIX = ".gz";
+  private static final String TAR_SUFFIX = ".tar";
+
+  private final String indexName;
+  private final String saveRootPath;
+  private IndexInfo info = null;
+  private Path indexFolderPath = null;
+  private boolean initialized = false;
+  private Path savePath;
+
+  /**
+   * Creates a handler for the index with the given name. Nothing is looked up or written until
+   * {@link #initialize()} is called.
+   *
+   * @param indexName name of the index as registered in {@link IndexInfo}
+   */
+  public PrebuiltIndexHandler(String indexName) {
+    this.indexName = indexName;
+    this.saveRootPath = getCache();
+  }
+
+  /**
+   * Picks the cache directory. The pyserini cache is preferred when it exists, to avoid
+   * downloading the same index twice; otherwise the anserini cache path is used.
+   *
+   * @return the cache directory as a string
+   */
+  private String getCache() {
+    final String home = System.getProperty("user.home");
+    final Path pyseriniPath = Paths.get(home, ".cache", "pyserini", "indexes");
+    final Path anseriniPath = Paths.get(home, ".cache", "anserini", "indexes");
+    return checkFileExist(pyseriniPath) ? pyseriniPath.toString() : anseriniPath.toString();
+  }
+
+  /** Returns whether the given path exists on disk. */
+  private static boolean checkFileExist(Path path) {
+    return Files.exists(path);
+  }
+
+  /** Returns {@code value} without the trailing {@code suffix}, or unchanged if it does not end with it. */
+  private static String stripSuffix(String value, String suffix) {
+    return value.endsWith(suffix) ? value.substring(0, value.length() - suffix.length()) : value;
+  }
+
+  /** Path of the uncompressed tar file: the archive path without a trailing {@code .gz}. */
+  private Path tarPath() {
+    // Only the trailing suffix is removed; a ".gz" elsewhere in the path must stay intact.
+    return Paths.get(stripSuffix(savePath.toString(), GZIP_SUFFIX));
+  }
+
+  /** Path of the extracted index folder: the tar path without a trailing {@code .tar}. */
+  private Path folderPath() {
+    return Paths.get(stripSuffix(tarPath().toString(), TAR_SUFFIX));
+  }
+
+  /**
+   * Checks whether the index is present in any form: the downloaded archive, the gunzipped tar
+   * file, or the extracted folder.
+   *
+   * @return {@code true} if any of the three exists
+   */
+  private boolean checkIndexFileExist() {
+    return checkFileExist(savePath) || checkFileExist(tarPath()) || checkFileExist(folderPath());
+  }
+
+  /**
+   * Gets the index info from the index name.
+   *
+   * @throws IllegalArgumentException if the name is not registered in {@link IndexInfo}
+   */
+  private static IndexInfo getIndexInfo(String indexName) {
+    try {
+      return IndexInfo.get(indexName);
+    } catch (IllegalArgumentException e) {
+      // Keep the original exception as the cause so the stack trace is not lost.
+      throw new IllegalArgumentException("Index not found! " + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Checks the MD5 checksum of the data read from the given stream.
+   *
+   * @param st stream to digest; it is read to the end but not closed here
+   * @param md5 expected checksum as a hex string
+   * @return {@code true} if the computed checksum matches {@code md5}
+   * @throws IOException if reading the stream fails
+   */
+  private static boolean checkMD5(InputStream st, String md5) throws IOException {
+    String generatedChecksum = DigestUtils.md5Hex(st);
+    // Hex digests are case-insensitive; tolerate an upper-case expected value.
+    return generatedChecksum.equalsIgnoreCase(md5);
+  }
+
+  /** Deletes the given file if present; a failure to delete is deliberately ignored. */
+  private static void deleteQuietly(Path path) {
+    try {
+      Files.deleteIfExists(path);
+    } catch (IOException ignored) {
+      // Best-effort cleanup: the failure that triggered the cleanup is the one worth reporting.
+    }
+  }
+
+  /**
+   * Resolves the index metadata and prepares the cache directory. Calling this more than once
+   * has no further effect.
+   *
+   * @throws IllegalArgumentException if the index name is not registered
+   * @throws IllegalStateException if the cache directory cannot be created
+   */
+  public void initialize() {
+    if (initialized) {
+      return;
+    }
+    info = getIndexInfo(indexName);
+    Path root = Paths.get(saveRootPath);
+    if (!checkFileExist(root)) {
+      try {
+        Files.createDirectories(root);
+      } catch (IOException e) {
+        // Fail here instead of letting the download fail later with a less helpful error.
+        throw new IllegalStateException("Unable to create index cache directory " + saveRootPath, e);
+      }
+    }
+    savePath = Paths.get(saveRootPath, info.filename);
+    initialized = true;
+  }
+
+  /**
+   * Downloads the index archive to the cache and verifies its MD5 checksum. If the archive, the
+   * tar file or the extracted folder already exists, nothing is downloaded. When the transfer
+   * or the verification fails, the partial or corrupt file is removed so that a later call
+   * downloads it again.
+   *
+   * @throws IllegalStateException if {@link #initialize()} has not been called
+   * @throws IOException if the transfer fails or the checksum does not match
+   */
+  public void download() throws IOException {
+    if (!initialized) {
+      throw new IllegalStateException("Handler not initialized!");
+    }
+    if (checkIndexFileExist()) {
+      System.out.println("Index file already exists! Skip downloading.");
+      return;
+    }
+
+    boolean verified = false;
+    try {
+      fetchArchive();
+      // The output stream is closed by now, so the checksum covers the complete file.
+      try (InputStream is = Files.newInputStream(savePath)) {
+        verified = checkMD5(is, info.md5);
+      }
+    } finally {
+      if (!verified) {
+        deleteQuietly(savePath);
+      }
+    }
+    if (!verified) {
+      throw new IOException("MD5 check failed!");
+    }
+  }
+
+  /**
+   * Streams the first registered URL into {@code savePath}, advancing a progress bar (in units
+   * of 1000 bytes) as data arrives. The copy runs on the calling thread.
+   *
+   * @throws IOException if the connection or the copy fails
+   */
+  private void fetchArchive() throws IOException {
+    URL url = new URL(info.urls[0]);
+    // One connection serves both the length query and the data.
+    HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection());
+    try {
+      long completeFileSize = httpConnection.getContentLengthLong();
+      try (InputStream inputStream = httpConnection.getInputStream();
+          FileOutputStream fileOS = new FileOutputStream(savePath.toFile());
+          ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) {
+
+        pb.setExtraMessage("Downloading...");
+
+        byte[] buffer = new byte[1 << 16];
+        long transferred = 0;
+        int read;
+        // Reading until end-of-stream also works when the server reports no content length.
+        while ((read = inputStream.read(buffer)) != -1) {
+          fileOS.write(buffer, 0, read);
+          transferred += read;
+          pb.stepTo(Math.floorDiv(transferred, 1000));
+        }
+      }
+    } finally {
+      httpConnection.disconnect();
+    }
+  }
+
+  /**
+   * Runs an external command, forwarding its output to this process' console, and waits for it
+   * to finish.
+   *
+   * @throws IOException if the command cannot be started or exits with a non-zero status
+   * @throws InterruptedException if the calling thread is interrupted while waiting
+   */
+  private static void runCommand(String... command) throws IOException, InterruptedException {
+    // Inherit stdout/stderr so the child can never block on a pipe nobody reads.
+    Process process = new ProcessBuilder(command)
+        .redirectOutput(ProcessBuilder.Redirect.INHERIT)
+        .redirectError(ProcessBuilder.Redirect.INHERIT)
+        .start();
+    int exitCode;
+    try {
+      exitCode = process.waitFor();
+    } catch (InterruptedException e) {
+      process.destroy();
+      // Restore the interrupt flag for callers further up the stack.
+      Thread.currentThread().interrupt();
+      throw e;
+    }
+    if (exitCode != 0) {
+      throw new IOException(
+          String.format("Command '%s' failed with exit code %d.", String.join(" ", command), exitCode));
+    }
+  }
+
+  /**
+   * Decompresses the tar.gz or tar index file into the cache directory using the external
+   * {@code gzip} and {@code tar} commands. If the index folder already exists, nothing is done.
+   * The intermediate tar file is deleted after extraction to save space.
+   *
+   * @return path of the index folder
+   * @throws IllegalStateException if {@link #initialize()} has not been called
+   * @throws IOException if an external command cannot be run or reports failure
+   * @throws Exception if neither the archive, the tar file nor the folder exists
+   */
+  public String decompressIndex() throws Exception {
+    if (!initialized) {
+      throw new IllegalStateException("Handler not initialized!");
+    }
+    if (!checkIndexFileExist()) {
+      throw new Exception("Index file does not exist!");
+    }
+
+    Path folder = folderPath();
+    String indexFolder = folder.toString();
+    if (checkFileExist(folder)) {
+      System.out.println("Index folder already exists!");
+      // Record the folder here too, so getIndexFolderPath() works for an already-extracted index.
+      this.indexFolderPath = folder;
+      return indexFolder;
+    }
+    System.out.println("Decompressing index...");
+
+    Path tarFile = tarPath();
+    // Only gunzip when the archive is actually gzip-compressed (name ends in ".gz").
+    if (!tarFile.equals(savePath) && checkFileExist(savePath)) {
+      runCommand("gzip", "-d", savePath.toString());
+    }
+
+    if (checkFileExist(tarFile)) {
+      runCommand("tar", "-xvf", tarFile.toString(), "-C", saveRootPath);
+
+      // delete the tar file to save space
+      Files.delete(tarFile);
+    }
+
+    System.out.println("Index decompressed successfully!");
+    this.indexFolderPath = folder;
+    return indexFolder;
+  }
+
+  /**
+   * Returns the index folder recorded by the last successful {@link #decompressIndex()} call.
+   *
+   * @return the index folder, or {@code null} if {@link #decompressIndex()} has not completed
+   */
+  public Path getIndexFolderPath() {
+    return this.indexFolderPath;
+  }
+}