Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add download from remote feature #2301

Merged
merged 9 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -463,5 +463,15 @@
<artifactId>api</artifactId>
<version>0.21.0</version>
</dependency>
<dependency>
<groupId>me.tongfei</groupId>
<artifactId>progressbar</artifactId>
<version>0.10.0</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.15</version>
</dependency>
</dependencies>
</project>
102 changes: 102 additions & 0 deletions src/main/java/io/anserini/index/IndexInfo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

/**
 * Catalogue of prebuilt indexes that can be fetched from a remote location.
 *
 * <p>Each constant records the name used to look the index up, a human-readable
 * description, the archive file name, an optional README file name, the download
 * URLs, the expected MD5 checksum of the archive, and corpus statistics. All
 * statistics are stored as strings exactly as they are written in the constant
 * declarations.</p>
 */
public enum IndexInfo {
  MSMARCO_V1_PASSAGE("msmarco-v1-passage",
      "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
      new String[] {
          "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz" },
      "c697b18c9a0686ca760583e615dbe450", "2170758938", "352316036", "8841823",
      "2660824", false),

  CACM("cacm",
      "Lucene index of the CACM corpus. (Lucene 9)",
      "lucene-index.cacm.tar.gz",
      new String[] {
          "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz" },
      "cfe14d543c6a27f4d742fb2d0099b8e0",
      "2347197",
      "320968",
      "3204",
      "14363");

  /** Name used to look the index up via {@link #get(String)} and {@link #contains(String)}. */
  public final String indexName;
  /** Human-readable description of the index. */
  public final String description;
  /** File name of the downloadable archive. */
  public final String filename;
  /** File name of the accompanying README; the empty string when none was declared. */
  public final String readme;
  /** Download locations for the archive. */
  public final String[] urls;
  /** Expected MD5 checksum of the archive, as a hex string. */
  public final String md5;
  /** Size figure recorded for the index (stored as a string). */
  public final String size;
  /** Total number of terms recorded for the index (stored as a string). */
  public final String totalTerms;
  /** Total number of documents recorded for the index (stored as a string). */
  public final String totalDocs;
  /** Total number of unique terms recorded for the index (stored as a string). */
  public final String totalUniqueTerms;
  /** Flag supplied at declaration time; {@code false} unless the constant sets it explicitly. */
  public final boolean downloaded;

  /**
   * Canonical constructor that sets all eleven fields.
   */
  IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
      String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
    this.indexName = indexName;
    this.description = description;
    this.filename = filename;
    this.readme = readme;
    this.urls = urls;
    this.md5 = md5;
    this.size = size;
    this.totalTerms = totalTerms;
    this.totalDocs = totalDocs;
    this.totalUniqueTerms = totalUniqueTerms;
    this.downloaded = downloaded;
  }

  /**
   * Convenience constructor for entries without a README: {@code readme} becomes the
   * empty string and {@code downloaded} becomes {@code false}. Delegates to the
   * canonical constructor so the field assignments live in exactly one place.
   */
  IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
      String totalTerms, String totalDocs, String totalUniqueTerms) {
    this(indexName, description, filename, "", urls, md5, size, totalTerms, totalDocs, totalUniqueTerms, false);
  }

  /**
   * Tells whether an index with the given name is registered.
   *
   * @param indexName name to look for (exact, case-sensitive match); {@code null} never matches
   * @return {@code true} if some constant has this {@code indexName}
   */
  public static boolean contains(String indexName) {
    return find(indexName) != null;
  }

  /**
   * Returns the entry registered under the given name.
   *
   * @param indexName name to look for (exact, case-sensitive match)
   * @return the matching constant
   * @throws IllegalArgumentException if no constant has this {@code indexName}
   */
  public static IndexInfo get(String indexName) {
    IndexInfo match = find(indexName);
    if (match == null) {
      throw new IllegalArgumentException("Index name " + indexName + " not found!");
    }
    return match;
  }

  /**
   * Shared lookup behind {@link #contains(String)} and {@link #get(String)}.
   *
   * @return the first constant whose {@code indexName} equals the argument, or {@code null}
   */
  private static IndexInfo find(String indexName) {
    for (IndexInfo candidate : values()) {
      if (candidate.indexName.equals(indexName)) {
        return candidate;
      }
    }
    return null;
  }

}
21 changes: 19 additions & 2 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import io.anserini.search.topicreader.BackgroundLinkingTopicReader;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.search.topicreader.Topics;
import io.anserini.util.PrebuiltIndexHandler;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
Expand Down Expand Up @@ -955,12 +957,27 @@

public SearchCollection(Args args) throws IOException {
this.args = args;
Path indexPath = Paths.get(args.index);
Path indexPath = Path.of(args.index);
PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
if (!Files.exists(indexPath)) {
// it doesn't exist locally, we try to download it from remote
try {
indexHandler.initialize();
indexHandler.download();
indexPath = Path.of(indexHandler.decompressIndex());
} catch (IOException e) {
throw new RuntimeException("MD5 checksum does not match!");

Check warning on line 969 in src/main/java/io/anserini/search/SearchCollection.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/search/SearchCollection.java#L965-L969

Added lines #L965 - L969 were not covered by tests
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
}

Check warning on line 972 in src/main/java/io/anserini/search/SearchCollection.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/search/SearchCollection.java#L972

Added line #L972 was not covered by tests
} else {
// if it exists locally, we use it
indexPath = Paths.get(args.index);
}

if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
}

LOG.info("============ Initializing Searcher ============");
LOG.info("Index: " + indexPath);
this.reader = args.inmem ? DirectoryReader.open(MMapDirectory.open(indexPath)) :
Expand Down
206 changes: 206 additions & 0 deletions src/main/java/io/anserini/util/PrebuiltIndexHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.util;

import me.tongfei.progressbar.ProgressBar;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CountingInputStream;

import io.anserini.index.IndexInfo;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class PrebuiltIndexHandler {
private String indexName;
private String saveRootPath;
private IndexInfo info = null;
private Path indexFolderPath = null;
private boolean initialized = false;
private Path savePath;

public PrebuiltIndexHandler(String indexName) {
this.indexName = indexName;
this.saveRootPath = getCache();
}

private String getCache() {
/*
* Get the pyserini cache path firs to avoid double downloads. If the pyserini
* cache path does not exist, use the anserini cache path.
*/
final Path PyseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "pyserini", "indexes");
final Path AnseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "anserini", "indexes");
if (checkFileExist(PyseriniPath)) {
return PyseriniPath.toString();

Check warning on line 57 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L57

Added line #L57 was not covered by tests
} else {
return AnseriniPath.toString();
}
}

private static boolean checkFileExist(Path path) {
return path.toFile().exists();
}

private boolean checkIndexFileExist() {
/*
* Check if the index file exists. If the index file exists, return true.
* Otherwise, return false.
*/
if (checkFileExist(savePath) || checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))
|| checkFileExist(Paths.get(savePath.toString().replace(".tar.gz", "")))) {
return true;
}
return false;
}

private static IndexInfo getIndexInfo(String indexName) {
/*
* Get the index info from the index name.
*/
try {
IndexInfo info = IndexInfo.get(indexName);
return info;
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("Index not found!" + e.getMessage());
}
}

private static boolean checkMD5(InputStream st, String md5) throws IOException {
/*
* Check the MD5 checksum of the index file.
*/
String generatedChecksum = DigestUtils.md5Hex(st);
return generatedChecksum.equals(md5);
}

public void initialize() {
if (initialized) {
return;

Check warning on line 101 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L101

Added line #L101 was not covered by tests
}
info = getIndexInfo(indexName);
// check if saveRootPath exists
if (!checkFileExist(Paths.get(saveRootPath))) {
try {
Files.createDirectories(Paths.get(saveRootPath));
} catch (IOException e) {
e.printStackTrace();

Check warning on line 109 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L108-L109

Added lines #L108 - L109 were not covered by tests
}
}
savePath = Paths.get(saveRootPath, info.filename);
initialized = true;
}

public void download() throws IOException {
/*
* Download the index file to the save path. If the file already exists, do
* nothing. If the file does not exist, download the file and check the MD5
* checksum.
*/
if (!initialized) {
throw new IllegalStateException("Handler not initialized!");

Check warning on line 123 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L123

Added line #L123 was not covered by tests
}
if (checkIndexFileExist()) {
System.out.println("Index file already exists! Skip downloading.");
return;

Check warning on line 127 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L126-L127

Added lines #L126 - L127 were not covered by tests
}

URL url = new URL(info.urls[0]);
HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection());
long completeFileSize = httpConnection.getContentLengthLong();

try (InputStream inputStream = url.openStream();
CountingInputStream cis = new CountingInputStream(inputStream);
FileOutputStream fileOS = new FileOutputStream(savePath.toFile());
ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) {

pb.setExtraMessage("Downloading...");

new Thread(() -> {
try {
IOUtils.copyLarge(cis, fileOS);
} catch (IOException e) {
e.printStackTrace();

Check warning on line 145 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L144-L145

Added lines #L144 - L145 were not covered by tests
}
}).start();

while (cis.getByteCount() < completeFileSize) {
pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
}

pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
pb.close();

InputStream is = Files.newInputStream(savePath);
if (!checkMD5(is, info.md5)) {
throw new IOException("MD5 check failed!");

Check warning on line 158 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L158

Added line #L158 was not covered by tests
}
}
}

public String decompressIndex() throws Exception {
/*
* Decompress the tar.gz or tar index file to an archive folder. If the folder
* already exists, do nothing.
*/
if (!initialized) {
throw new IllegalStateException("Handler not initialized!");

Check warning on line 169 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L169

Added line #L169 was not covered by tests
}
if (!checkIndexFileExist()) {
throw new Exception("Index file does not exist!");

Check warning on line 172 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L172

Added line #L172 was not covered by tests
}

String indexFolder = savePath.toString().replace(".tar.gz", "");
if (checkFileExist(Paths.get(indexFolder))) {
System.out.println("Index folder already exists!");
return indexFolder;

Check warning on line 178 in src/main/java/io/anserini/util/PrebuiltIndexHandler.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/util/PrebuiltIndexHandler.java#L177-L178

Added lines #L177 - L178 were not covered by tests
}
System.out.println("Decompressing index...");

if (checkFileExist(Paths.get(savePath.toString()))) {
ProcessBuilder pbGZIP = new ProcessBuilder("gzip", "-d", savePath.toString());
Process pGZIP = pbGZIP.start();
pGZIP.waitFor();
}

if (checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))) {
ProcessBuilder pbTAR = new ProcessBuilder("tar", "-xvf",
savePath.toString().substring(0, savePath.toString().length() - 3), "-C", saveRootPath);
Process pTar = pbTAR.start();
pTar.waitFor();

// detele the tar file for saving space
Files.delete(Path.of(savePath.toString().replace(".gz", "")));
}

System.out.println("Index decompressed successfully!");
this.indexFolderPath = Paths.get(indexFolder);
return indexFolder;
}

public Path getIndexFolderPath() {
return this.indexFolderPath;
}
}
Loading
Loading