Skip to content

Add download from remote feature #2301

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -463,5 +463,15 @@
<artifactId>api</artifactId>
<version>0.21.0</version>
</dependency>
<dependency>
<groupId>me.tongfei</groupId>
<artifactId>progressbar</artifactId>
<version>0.10.0</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.15</version>
</dependency>
</dependencies>
</project>
102 changes: 102 additions & 0 deletions src/main/java/io/anserini/index/IndexInfo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

/**
 * Registry of prebuilt indexes that can be fetched from a remote location.
 *
 * <p>Each constant bundles the short name used to look the index up, a human-readable
 * description, the archive file name, an optional README file name, the download URLs, the
 * expected MD5 checksum of the archive, and a handful of statistics stored as strings exactly as
 * written in the constant declaration.
 */
public enum IndexInfo {
  MSMARCO_V1_PASSAGE(
      "msmarco-v1-passage",
      "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
      new String[] {
        "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz"
      },
      "c697b18c9a0686ca760583e615dbe450",
      "2170758938",
      "352316036",
      "8841823",
      "2660824",
      false),

  CACM(
      "cacm",
      "Lucene index of the CACM corpus. (Lucene 9)",
      "lucene-index.cacm.tar.gz",
      new String[] {
        "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz"
      },
      "cfe14d543c6a27f4d742fb2d0099b8e0",
      "2347197",
      "320968",
      "3204",
      "14363");

  /** Short name used by {@link #contains(String)} and {@link #get(String)}. */
  public final String indexName;
  /** Human-readable description of the index. */
  public final String description;
  /** File name of the downloadable archive. */
  public final String filename;
  /** File name of the accompanying README; empty when the short constructor is used. */
  public final String readme;
  /** Locations the archive can be downloaded from. */
  public final String[] urls;
  /** Expected MD5 checksum of the archive, as a hex string. */
  public final String md5;
  /** Size of the index, kept as the string given in the declaration. */
  public final String size;
  /** Total number of terms, kept as the string given in the declaration. */
  public final String totalTerms;
  /** Total number of documents, kept as the string given in the declaration. */
  public final String totalDocs;
  /** Total number of unique terms, kept as the string given in the declaration. */
  public final String totalUniqueTerms;
  /** Flag supplied by the declaration; {@code false} when the short constructor is used. */
  public final boolean downloaded;

  /**
   * Full constructor that sets every one of the eleven fields explicitly.
   */
  IndexInfo(
      String indexName,
      String description,
      String filename,
      String readme,
      String[] urls,
      String md5,
      String size,
      String totalTerms,
      String totalDocs,
      String totalUniqueTerms,
      boolean downloaded) {
    this.indexName = indexName;
    this.description = description;
    this.filename = filename;
    this.readme = readme;
    this.urls = urls;
    this.md5 = md5;
    this.size = size;
    this.totalTerms = totalTerms;
    this.totalDocs = totalDocs;
    this.totalUniqueTerms = totalUniqueTerms;
    this.downloaded = downloaded;
  }

  /**
   * Short constructor taking nine values; the README name defaults to the empty string and the
   * {@code downloaded} flag defaults to {@code false}.
   */
  IndexInfo(
      String indexName,
      String description,
      String filename,
      String[] urls,
      String md5,
      String size,
      String totalTerms,
      String totalDocs,
      String totalUniqueTerms) {
    this(indexName, description, filename, "", urls, md5, size, totalTerms, totalDocs,
        totalUniqueTerms, false);
  }

  /**
   * Tells whether some constant is registered under the given short name.
   *
   * @param indexName short name to look for
   * @return {@code true} if a constant's {@link #indexName} equals the argument
   */
  public static boolean contains(String indexName) {
    final IndexInfo[] known = IndexInfo.values();
    int i = 0;
    while (i < known.length) {
      if (known[i].indexName.equals(indexName)) {
        return true;
      }
      i++;
    }
    return false;
  }

  /**
   * Returns the constant registered under the given short name.
   *
   * @param indexName short name to look for
   * @return the first constant whose {@link #indexName} equals the argument
   * @throws IllegalArgumentException if no constant carries that name
   */
  public static IndexInfo get(String indexName) {
    IndexInfo match = null;
    for (IndexInfo candidate : IndexInfo.values()) {
      if (candidate.indexName.equals(indexName)) {
        match = candidate;
        break;
      }
    }
    if (match == null) {
      throw new IllegalArgumentException("Index name " + indexName + " not found!");
    }
    return match;
  }

}
21 changes: 19 additions & 2 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import io.anserini.search.topicreader.BackgroundLinkingTopicReader;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.search.topicreader.Topics;
import io.anserini.util.PrebuiltIndexHandler;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
Expand Down Expand Up @@ -955,12 +957,27 @@ public void run() {

public SearchCollection(Args args) throws IOException {
this.args = args;
Path indexPath = Paths.get(args.index);
Path indexPath = Path.of(args.index);
PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
if (!Files.exists(indexPath)) {
// it doesn't exist locally, we try to download it from remote
try {
indexHandler.initialize();
indexHandler.download();
indexPath = Path.of(indexHandler.decompressIndex());
} catch (IOException e) {
throw new RuntimeException("MD5 checksum does not match!");
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
}
} else {
// if it exists locally, we use it
indexPath = Paths.get(args.index);
}

if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
}

LOG.info("============ Initializing Searcher ============");
LOG.info("Index: " + indexPath);
this.reader = args.inmem ? DirectoryReader.open(MMapDirectory.open(indexPath)) :
Expand Down
206 changes: 206 additions & 0 deletions src/main/java/io/anserini/util/PrebuiltIndexHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.util;

import me.tongfei.progressbar.ProgressBar;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CountingInputStream;

import io.anserini.index.IndexInfo;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Downloads, verifies and unpacks a prebuilt index described by an {@link IndexInfo} entry.
 *
 * <p>Intended call order: construct with an index name, call {@link #initialize()}, then
 * {@link #download()}, then {@link #decompressIndex()}. Instances are not designed for concurrent
 * use.
 */
public class PrebuiltIndexHandler {
  private static final String GZIP_SUFFIX = ".gz";
  private static final String TAR_GZIP_SUFFIX = ".tar.gz";
  private static final int COPY_BUFFER_SIZE = 8192;

  private final String indexName;
  private final String saveRootPath;
  private IndexInfo info = null;
  private Path indexFolderPath = null;
  private boolean initialized = false;
  private Path savePath;

  /**
   * Creates a handler for the named index and picks the cache directory it will use.
   *
   * @param indexName short name of the index, resolved against {@link IndexInfo} by
   *     {@link #initialize()}
   */
  public PrebuiltIndexHandler(String indexName) {
    this.indexName = indexName;
    this.saveRootPath = getCache();
  }

  /**
   * Chooses the cache directory. The pyserini cache under the user's home directory is preferred
   * when it already exists (to avoid downloading the same index twice); otherwise the anserini
   * cache path is returned.
   */
  private String getCache() {
    final String home = System.getProperty("user.home");
    final Path pyseriniPath = Paths.get(home, ".cache", "pyserini", "indexes");
    final Path anseriniPath = Paths.get(home, ".cache", "anserini", "indexes");
    return checkFileExist(pyseriniPath) ? pyseriniPath.toString() : anseriniPath.toString();
  }

  /** Returns whether the given path exists on disk. */
  private static boolean checkFileExist(Path path) {
    return Files.exists(path);
  }

  /** Removes {@code suffix} from the end of {@code value}; returns {@code value} unchanged otherwise. */
  private static String stripSuffix(String value, String suffix) {
    return value.endsWith(suffix) ? value.substring(0, value.length() - suffix.length()) : value;
  }

  /** Path of the intermediate tar file, i.e. the archive path without its trailing ".gz". */
  private Path tarPath() {
    return Paths.get(stripSuffix(savePath.toString(), GZIP_SUFFIX));
  }

  /** Path of the unpacked index folder, i.e. the archive path without its trailing ".tar.gz". */
  private Path extractedFolderPath() {
    return Paths.get(stripSuffix(savePath.toString(), TAR_GZIP_SUFFIX));
  }

  /**
   * Returns whether the index is present in any of its forms: the ".tar.gz" archive, the
   * intermediate ".tar" file, or the unpacked folder.
   */
  private boolean checkIndexFileExist() {
    return checkFileExist(savePath) || checkFileExist(tarPath())
        || checkFileExist(extractedFolderPath());
  }

  /**
   * Resolves the {@link IndexInfo} entry for an index name.
   *
   * @throws IllegalArgumentException if the name is unknown; the original exception is kept as
   *     the cause
   */
  private static IndexInfo getIndexInfo(String indexName) {
    try {
      return IndexInfo.get(indexName);
    } catch (IllegalArgumentException e) {
      throw new IllegalArgumentException("Index not found!" + e.getMessage(), e);
    }
  }

  /**
   * Compares the MD5 digest of the stream's content with the expected hex string. The comparison
   * ignores letter case so an upper-case expected checksum still matches.
   */
  private static boolean checkMD5(InputStream st, String md5) throws IOException {
    String generatedChecksum = DigestUtils.md5Hex(st);
    return generatedChecksum.equalsIgnoreCase(md5);
  }

  /**
   * Resolves the index metadata, makes sure the cache directory exists and computes the archive
   * path. Calling it again after a successful run is a no-op.
   *
   * @throws IllegalArgumentException if the index name is unknown
   * @throws IllegalStateException if the cache directory cannot be created
   */
  public void initialize() {
    if (initialized) {
      return;
    }
    info = getIndexInfo(indexName);
    Path root = Paths.get(saveRootPath);
    if (!checkFileExist(root)) {
      try {
        Files.createDirectories(root);
      } catch (IOException e) {
        // Fail here with the real cause instead of later with an unrelated "file not found".
        throw new IllegalStateException("Unable to create cache directory " + root, e);
      }
    }
    savePath = Paths.get(saveRootPath, info.filename);
    initialized = true;
  }

  /**
   * Downloads the archive from the first URL of the index entry into the cache directory and
   * verifies its MD5 checksum. Nothing is downloaded when the archive, the intermediate tar file
   * or the unpacked folder is already present.
   *
   * <p>If the transfer or the checksum verification fails, the partially written archive is
   * removed so that a later call downloads it again instead of skipping.
   *
   * @throws IllegalStateException if {@link #initialize()} has not been called
   * @throws IOException if the transfer fails or the checksum does not match
   */
  public void download() throws IOException {
    if (!initialized) {
      throw new IllegalStateException("Handler not initialized!");
    }
    if (checkIndexFileExist()) {
      System.out.println("Index file already exists! Skip downloading.");
      return;
    }

    URL url = new URL(info.urls[0]);
    HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection());
    try {
      // May be -1 when the server does not announce a length; the copy loop below reads to EOF
      // regardless, so the length is only used to size the progress bar.
      long completeFileSize = httpConnection.getContentLengthLong();

      try (InputStream inputStream = httpConnection.getInputStream();
          FileOutputStream fileOS = new FileOutputStream(savePath.toFile());
          ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) {
        pb.setExtraMessage("Downloading...");

        byte[] buffer = new byte[COPY_BUFFER_SIZE];
        long bytesCopied = 0;
        int read;
        while ((read = inputStream.read(buffer)) != -1) {
          fileOS.write(buffer, 0, read);
          bytesCopied += read;
          pb.stepTo(Math.floorDiv(bytesCopied, 1000));
        }
      }

      // The output stream is closed at this point, so the digest sees the complete file.
      try (InputStream is = Files.newInputStream(savePath)) {
        if (!checkMD5(is, info.md5)) {
          throw new IOException("MD5 check failed!");
        }
      }
    } catch (IOException e) {
      try {
        Files.deleteIfExists(savePath);
      } catch (IOException cleanupFailure) {
        e.addSuppressed(cleanupFailure);
      }
      throw e;
    } finally {
      httpConnection.disconnect();
    }
  }

  /**
   * Unpacks the downloaded archive into the cache directory using the external {@code gzip} and
   * {@code tar} commands, and deletes the intermediate tar file afterwards. If the target folder
   * already exists nothing is unpacked.
   *
   * @return the path of the index folder (the archive path without ".tar.gz")
   * @throws IllegalStateException if {@link #initialize()} has not been called, or if neither the
   *     archive, the tar file nor the folder exists
   * @throws IOException if an external command cannot be started or exits with a non-zero code,
   *     or if the tar file cannot be deleted
   * @throws InterruptedException if the thread is interrupted while waiting for a command
   */
  public String decompressIndex() throws Exception {
    if (!initialized) {
      throw new IllegalStateException("Handler not initialized!");
    }
    if (!checkIndexFileExist()) {
      throw new IllegalStateException("Index file does not exist!");
    }

    Path folder = extractedFolderPath();
    String indexFolder = folder.toString();
    if (checkFileExist(folder)) {
      System.out.println("Index folder already exists!");
      // Record the folder here too so getIndexFolderPath() is usable on this path.
      this.indexFolderPath = folder;
      return indexFolder;
    }
    System.out.println("Decompressing index...");

    if (checkFileExist(savePath)) {
      // -f overwrites a stale tar file left behind by an earlier interrupted run.
      runCommand("gzip", "-d", "-f", savePath.toString());
    }

    Path tar = tarPath();
    if (checkFileExist(tar)) {
      runCommand("tar", "-xvf", tar.toString(), "-C", saveRootPath);

      // delete the tar file to save space
      Files.delete(tar);
    }

    System.out.println("Index decompressed successfully!");
    this.indexFolderPath = folder;
    return indexFolder;
  }

  /**
   * Runs an external command and waits for it to finish. Standard output is discarded so the
   * child can never block on a full pipe; standard error is forwarded to this process.
   *
   * @throws IOException if the command cannot be started or exits with a non-zero code
   * @throws InterruptedException if interrupted while waiting; the child is destroyed and the
   *     interrupt flag is restored before rethrowing
   */
  private static void runCommand(String... command) throws IOException, InterruptedException {
    Process process = new ProcessBuilder(command)
        .redirectOutput(ProcessBuilder.Redirect.DISCARD)
        .redirectError(ProcessBuilder.Redirect.INHERIT)
        .start();
    int exitCode;
    try {
      exitCode = process.waitFor();
    } catch (InterruptedException e) {
      process.destroy();
      Thread.currentThread().interrupt();
      throw e;
    }
    if (exitCode != 0) {
      throw new IOException(String.format("Command '%s' failed with exit code %d",
          String.join(" ", command), exitCode));
    }
  }

  /**
   * Returns the unpacked index folder recorded by {@link #decompressIndex()}, or {@code null} if
   * that method has not completed yet.
   */
  public Path getIndexFolderPath() {
    return this.indexFolderPath;
  }
}
Loading