-
Notifications
You must be signed in to change notification settings - Fork 497
Add download from remote feature #2301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
728e5fb
add download from remote feature
ArthurChen189 9c8267c
fix maven build
ArthurChen189 2e12e02
add unittest for prebuilt index handler
ArthurChen189 f4f9de4
rename IndexHandler to PrebuiltIndexHandler
ArthurChen189 5962d92
minor fixup
ArthurChen189 288f95a
minor fixup
ArthurChen189 360f199
merge with main
ArthurChen189 ded4ce6
resolve merge conflicts
ArthurChen189 6bbc664
minor fixup
ArthurChen189 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.index; | ||
|
||
/**
 * Catalog of prebuilt indexes that can be fetched from a remote location.
 *
 * <p>Each constant bundles the metadata needed to locate and verify one index archive: a lookup
 * name, a description, the archive file name, an optional README file name, the download URLs, and
 * the MD5 checksum the downloaded archive is compared against.
 */
public enum IndexInfo {
  MSMARCO_V1_PASSAGE("msmarco-v1-passage",
      "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
      new String[] {
          "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz" },
      "c697b18c9a0686ca760583e615dbe450", "2170758938", "352316036", "8841823",
      "2660824", false),

  CACM("cacm",
      "Lucene index of the CACM corpus. (Lucene 9)",
      "lucene-index.cacm.tar.gz",
      new String[] {
          "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz" },
      "cfe14d543c6a27f4d742fb2d0099b8e0",
      "2347197",
      "320968",
      "3204",
      "14363");

  /** Name used to look the index up via {@link #get(String)} and {@link #contains(String)}. */
  public final String indexName;
  /** Human-readable description of the index. */
  public final String description;
  /** File name of the index archive. */
  public final String filename;
  /** File name of the accompanying README; the empty string when none was supplied. */
  public final String readme;
  /** Download locations for the archive. NOTE(review): the array is shared, not copied; callers must not modify it. */
  public final String[] urls;
  /** Expected MD5 checksum of the archive, as a hex string. */
  public final String md5;
  /** Archive size as a decimal string; presumably in bytes -- TODO confirm. */
  public final String size;
  /** Collection statistic as a decimal string; presumably the total number of terms -- TODO confirm. */
  public final String totalTerms;
  /** Collection statistic as a decimal string; presumably the total number of documents -- TODO confirm. */
  public final String totalDocs;
  /** Collection statistic as a decimal string; presumably the number of unique terms -- TODO confirm. */
  public final String totalUniqueTerms;
  /** Flag supplied at construction time; defaults to {@code false} when omitted. */
  public final boolean downloaded;

  /**
   * Creates an entry with all 11 fields given explicitly.
   */
  IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
      String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
    this.indexName = indexName;
    this.description = description;
    this.filename = filename;
    this.readme = readme;
    this.urls = urls;
    this.md5 = md5;
    this.size = size;
    this.totalTerms = totalTerms;
    this.totalDocs = totalDocs;
    this.totalUniqueTerms = totalUniqueTerms;
    this.downloaded = downloaded;
  }

  /**
   * Creates an entry from 9 fields; {@code readme} defaults to the empty string and
   * {@code downloaded} to {@code false}.
   */
  IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
      String totalTerms, String totalDocs, String totalUniqueTerms) {
    // Delegate so the field assignments live in exactly one place.
    this(indexName, description, filename, "", urls, md5, size, totalTerms, totalDocs, totalUniqueTerms, false);
  }

  /**
   * Tells whether an index with the given name is registered.
   *
   * @param indexName name to look for; {@code null} never matches
   * @return {@code true} if some constant has exactly this {@link #indexName}
   */
  public static boolean contains(String indexName) {
    return find(indexName) != null;
  }

  /**
   * Returns the entry registered under the given name.
   *
   * @param indexName name to look for
   * @return the matching constant
   * @throws IllegalArgumentException if no constant has this {@link #indexName}
   */
  public static IndexInfo get(String indexName) {
    IndexInfo match = find(indexName);
    if (match == null) {
      throw new IllegalArgumentException("Index name " + indexName + " not found!");
    }
    return match;
  }

  /** Linear scan shared by {@link #contains(String)} and {@link #get(String)}; returns null on a miss. */
  private static IndexInfo find(String indexName) {
    for (IndexInfo candidate : values()) {
      if (candidate.indexName.equals(indexName)) {
        return candidate;
      }
    }
    return null;
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
206 changes: 206 additions & 0 deletions
206
src/main/java/io/anserini/util/PrebuiltIndexHandler.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.util; | ||
|
||
import me.tongfei.progressbar.ProgressBar; | ||
|
||
import org.apache.commons.codec.digest.DigestUtils; | ||
import org.apache.commons.io.IOUtils; | ||
import org.apache.commons.io.input.CountingInputStream; | ||
|
||
import io.anserini.index.IndexInfo; | ||
|
||
import java.io.FileOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.HttpURLConnection; | ||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
|
||
public class PrebuiltIndexHandler { | ||
private String indexName; | ||
private String saveRootPath; | ||
private IndexInfo info = null; | ||
private Path indexFolderPath = null; | ||
private boolean initialized = false; | ||
private Path savePath; | ||
|
||
public PrebuiltIndexHandler(String indexName) { | ||
this.indexName = indexName; | ||
this.saveRootPath = getCache(); | ||
} | ||
|
||
private String getCache() { | ||
/* | ||
* Get the pyserini cache path firs to avoid double downloads. If the pyserini | ||
* cache path does not exist, use the anserini cache path. | ||
*/ | ||
final Path PyseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "pyserini", "indexes"); | ||
final Path AnseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "anserini", "indexes"); | ||
if (checkFileExist(PyseriniPath)) { | ||
return PyseriniPath.toString(); | ||
} else { | ||
return AnseriniPath.toString(); | ||
} | ||
} | ||
|
||
private static boolean checkFileExist(Path path) { | ||
return path.toFile().exists(); | ||
} | ||
|
||
private boolean checkIndexFileExist() { | ||
/* | ||
* Check if the index file exists. If the index file exists, return true. | ||
* Otherwise, return false. | ||
*/ | ||
if (checkFileExist(savePath) || checkFileExist(Paths.get(savePath.toString().replace(".gz", ""))) | ||
|| checkFileExist(Paths.get(savePath.toString().replace(".tar.gz", "")))) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
private static IndexInfo getIndexInfo(String indexName) { | ||
/* | ||
* Get the index info from the index name. | ||
*/ | ||
try { | ||
IndexInfo info = IndexInfo.get(indexName); | ||
return info; | ||
} catch (IllegalArgumentException e) { | ||
throw new IllegalArgumentException("Index not found!" + e.getMessage()); | ||
} | ||
} | ||
|
||
private static boolean checkMD5(InputStream st, String md5) throws IOException { | ||
/* | ||
* Check the MD5 checksum of the index file. | ||
*/ | ||
String generatedChecksum = DigestUtils.md5Hex(st); | ||
return generatedChecksum.equals(md5); | ||
} | ||
|
||
public void initialize() { | ||
if (initialized) { | ||
return; | ||
} | ||
info = getIndexInfo(indexName); | ||
// check if saveRootPath exists | ||
if (!checkFileExist(Paths.get(saveRootPath))) { | ||
try { | ||
Files.createDirectories(Paths.get(saveRootPath)); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
} | ||
savePath = Paths.get(saveRootPath, info.filename); | ||
initialized = true; | ||
} | ||
|
||
public void download() throws IOException { | ||
/* | ||
* Download the index file to the save path. If the file already exists, do | ||
* nothing. If the file does not exist, download the file and check the MD5 | ||
* checksum. | ||
*/ | ||
if (!initialized) { | ||
throw new IllegalStateException("Handler not initialized!"); | ||
} | ||
if (checkIndexFileExist()) { | ||
System.out.println("Index file already exists! Skip downloading."); | ||
return; | ||
} | ||
|
||
URL url = new URL(info.urls[0]); | ||
HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection()); | ||
long completeFileSize = httpConnection.getContentLengthLong(); | ||
|
||
try (InputStream inputStream = url.openStream(); | ||
CountingInputStream cis = new CountingInputStream(inputStream); | ||
FileOutputStream fileOS = new FileOutputStream(savePath.toFile()); | ||
ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) { | ||
|
||
pb.setExtraMessage("Downloading..."); | ||
|
||
new Thread(() -> { | ||
try { | ||
IOUtils.copyLarge(cis, fileOS); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
}).start(); | ||
|
||
while (cis.getByteCount() < completeFileSize) { | ||
pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000)); | ||
} | ||
|
||
pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000)); | ||
pb.close(); | ||
|
||
InputStream is = Files.newInputStream(savePath); | ||
if (!checkMD5(is, info.md5)) { | ||
throw new IOException("MD5 check failed!"); | ||
} | ||
} | ||
} | ||
|
||
public String decompressIndex() throws Exception { | ||
/* | ||
* Decompress the tar.gz or tar index file to an archive folder. If the folder | ||
* already exists, do nothing. | ||
*/ | ||
if (!initialized) { | ||
throw new IllegalStateException("Handler not initialized!"); | ||
} | ||
if (!checkIndexFileExist()) { | ||
throw new Exception("Index file does not exist!"); | ||
} | ||
|
||
String indexFolder = savePath.toString().replace(".tar.gz", ""); | ||
if (checkFileExist(Paths.get(indexFolder))) { | ||
System.out.println("Index folder already exists!"); | ||
return indexFolder; | ||
} | ||
System.out.println("Decompressing index..."); | ||
|
||
if (checkFileExist(Paths.get(savePath.toString()))) { | ||
ProcessBuilder pbGZIP = new ProcessBuilder("gzip", "-d", savePath.toString()); | ||
Process pGZIP = pbGZIP.start(); | ||
pGZIP.waitFor(); | ||
} | ||
|
||
if (checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))) { | ||
ProcessBuilder pbTAR = new ProcessBuilder("tar", "-xvf", | ||
savePath.toString().substring(0, savePath.toString().length() - 3), "-C", saveRootPath); | ||
Process pTar = pbTAR.start(); | ||
pTar.waitFor(); | ||
|
||
// detele the tar file for saving space | ||
Files.delete(Path.of(savePath.toString().replace(".gz", ""))); | ||
} | ||
|
||
System.out.println("Index decompressed successfully!"); | ||
this.indexFolderPath = Paths.get(indexFolder); | ||
return indexFolder; | ||
} | ||
|
||
public Path getIndexFolderPath() { | ||
return this.indexFolderPath; | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.