Skip to content

Commit 45591ab

Browse files
Add download-from-remote feature for pre-built indexes (#2301)
* add PrebuiltIndexHandler * add a download progress bar * add MD5 checksum checking * add gzip and unzip tarball functionalities * add corresponding unittests
1 parent 2c14a49 commit 45591ab

File tree

7 files changed

+457
-4
lines changed

7 files changed

+457
-4
lines changed

pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,5 +463,15 @@
463463
<artifactId>api</artifactId>
464464
<version>0.21.0</version>
465465
</dependency>
466+
<dependency>
467+
<groupId>me.tongfei</groupId>
468+
<artifactId>progressbar</artifactId>
469+
<version>0.10.0</version>
470+
</dependency>
471+
<dependency>
472+
<groupId>commons-codec</groupId>
473+
<artifactId>commons-codec</artifactId>
474+
<version>1.15</version>
475+
</dependency>
466476
</dependencies>
467477
</project>
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* Anserini: A Lucene toolkit for reproducible information retrieval research
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.anserini.index;
18+
19+
/**
 * Registry of pre-built indexes that can be looked up by name.
 *
 * <p>Each constant carries the metadata recorded for one index: its lookup name, a description,
 * the archive file name, an optional README file name, the download URLs, the expected MD5
 * checksum and several statistics kept as strings exactly as declared below.
 */
public enum IndexInfo {
  MSMARCO_V1_PASSAGE(
      "msmarco-v1-passage",
      "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
      "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
      new String[] {
        "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz"
      },
      "c697b18c9a0686ca760583e615dbe450",
      "2170758938",
      "352316036",
      "8841823",
      "2660824",
      false),

  CACM(
      "cacm",
      "Lucene index of the CACM corpus. (Lucene 9)",
      "lucene-index.cacm.tar.gz",
      new String[] {
        "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz"
      },
      "cfe14d543c6a27f4d742fb2d0099b8e0",
      "2347197",
      "320968",
      "3204",
      "14363");

  public final String indexName;
  public final String description;
  public final String filename;
  public final String readme;
  public final String[] urls;
  public final String md5;
  public final String size;
  public final String totalTerms;
  public final String totalDocs;
  public final String totalUniqueTerms;
  public final boolean downloaded;

  /**
   * Full constructor: every field is supplied explicitly.
   */
  IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
      String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
    this.indexName = indexName;
    this.description = description;
    this.filename = filename;
    this.readme = readme;
    this.urls = urls;
    this.md5 = md5;
    this.size = size;
    this.totalTerms = totalTerms;
    this.totalDocs = totalDocs;
    this.totalUniqueTerms = totalUniqueTerms;
    this.downloaded = downloaded;
  }

  /**
   * Short constructor for entries without a README: {@code readme} defaults to the empty string
   * and {@code downloaded} to {@code false}.
   */
  IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
      String totalTerms, String totalDocs, String totalUniqueTerms) {
    // Chain to the full constructor so the field assignments live in exactly one place.
    this(indexName, description, filename, "", urls, md5, size, totalTerms, totalDocs, totalUniqueTerms, false);
  }

  /**
   * Tells whether an index with the given name is registered.
   *
   * @param indexName the lookup name to test; {@code null} never matches
   * @return {@code true} if some constant has exactly this {@code indexName}
   */
  public static boolean contains(String indexName) {
    return find(indexName) != null;
  }

  /**
   * Returns the entry registered under the given name.
   *
   * @param indexName the lookup name
   * @return the matching constant
   * @throws IllegalArgumentException if no constant has this {@code indexName}
   */
  public static IndexInfo get(String indexName) {
    IndexInfo match = find(indexName);
    if (match == null) {
      throw new IllegalArgumentException("Index name " + indexName + " not found!");
    }
    return match;
  }

  /** Shared lookup used by {@link #contains} and {@link #get}; returns {@code null} when absent. */
  private static IndexInfo find(String indexName) {
    for (IndexInfo candidate : values()) {
      if (candidate.indexName.equals(indexName)) {
        return candidate;
      }
    }
    return null;
  }
}

src/main/java/io/anserini/search/SearchCollection.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
import io.anserini.search.topicreader.BackgroundLinkingTopicReader;
4545
import io.anserini.search.topicreader.TopicReader;
4646
import io.anserini.search.topicreader.Topics;
47+
import io.anserini.util.PrebuiltIndexHandler;
48+
4749
import org.apache.commons.io.IOUtils;
4850
import org.apache.commons.lang3.StringUtils;
4951
import org.apache.commons.lang3.time.DurationFormatUtils;
@@ -955,12 +957,27 @@ public void run() {
955957

956958
public SearchCollection(Args args) throws IOException {
957959
this.args = args;
958-
Path indexPath = Paths.get(args.index);
960+
Path indexPath = Path.of(args.index);
961+
PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
962+
if (!Files.exists(indexPath)) {
963+
// it doesn't exist locally, we try to download it from remote
964+
try {
965+
indexHandler.initialize();
966+
indexHandler.download();
967+
indexPath = Path.of(indexHandler.decompressIndex());
968+
} catch (IOException e) {
969+
throw new RuntimeException("MD5 checksum does not match!");
970+
} catch (Exception e) {
971+
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
972+
}
973+
} else {
974+
// if it exists locally, we use it
975+
indexPath = Paths.get(args.index);
976+
}
959977

960978
if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
961979
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
962980
}
963-
964981
LOG.info("============ Initializing Searcher ============");
965982
LOG.info("Index: " + indexPath);
966983
this.reader = args.inmem ? DirectoryReader.open(MMapDirectory.open(indexPath)) :
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/*
2+
* Anserini: A Lucene toolkit for reproducible information retrieval research
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.anserini.util;
18+
19+
import me.tongfei.progressbar.ProgressBar;
20+
21+
import org.apache.commons.codec.digest.DigestUtils;
22+
import org.apache.commons.io.IOUtils;
23+
import org.apache.commons.io.input.CountingInputStream;
24+
25+
import io.anserini.index.IndexInfo;
26+
27+
import java.io.FileOutputStream;
28+
import java.io.IOException;
29+
import java.io.InputStream;
30+
import java.net.HttpURLConnection;
31+
import java.net.URL;
32+
import java.nio.file.Files;
33+
import java.nio.file.Path;
34+
import java.nio.file.Paths;
35+
36+
public class PrebuiltIndexHandler {
37+
private String indexName;
38+
private String saveRootPath;
39+
private IndexInfo info = null;
40+
private Path indexFolderPath = null;
41+
private boolean initialized = false;
42+
private Path savePath;
43+
44+
public PrebuiltIndexHandler(String indexName) {
45+
this.indexName = indexName;
46+
this.saveRootPath = getCache();
47+
}
48+
49+
private String getCache() {
50+
/*
51+
* Get the pyserini cache path firs to avoid double downloads. If the pyserini
52+
* cache path does not exist, use the anserini cache path.
53+
*/
54+
final Path PyseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "pyserini", "indexes");
55+
final Path AnseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "anserini", "indexes");
56+
if (checkFileExist(PyseriniPath)) {
57+
return PyseriniPath.toString();
58+
} else {
59+
return AnseriniPath.toString();
60+
}
61+
}
62+
63+
private static boolean checkFileExist(Path path) {
64+
return path.toFile().exists();
65+
}
66+
67+
private boolean checkIndexFileExist() {
68+
/*
69+
* Check if the index file exists. If the index file exists, return true.
70+
* Otherwise, return false.
71+
*/
72+
if (checkFileExist(savePath) || checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))
73+
|| checkFileExist(Paths.get(savePath.toString().replace(".tar.gz", "")))) {
74+
return true;
75+
}
76+
return false;
77+
}
78+
79+
private static IndexInfo getIndexInfo(String indexName) {
80+
/*
81+
* Get the index info from the index name.
82+
*/
83+
try {
84+
IndexInfo info = IndexInfo.get(indexName);
85+
return info;
86+
} catch (IllegalArgumentException e) {
87+
throw new IllegalArgumentException("Index not found!" + e.getMessage());
88+
}
89+
}
90+
91+
private static boolean checkMD5(InputStream st, String md5) throws IOException {
92+
/*
93+
* Check the MD5 checksum of the index file.
94+
*/
95+
String generatedChecksum = DigestUtils.md5Hex(st);
96+
return generatedChecksum.equals(md5);
97+
}
98+
99+
public void initialize() {
100+
if (initialized) {
101+
return;
102+
}
103+
info = getIndexInfo(indexName);
104+
// check if saveRootPath exists
105+
if (!checkFileExist(Paths.get(saveRootPath))) {
106+
try {
107+
Files.createDirectories(Paths.get(saveRootPath));
108+
} catch (IOException e) {
109+
e.printStackTrace();
110+
}
111+
}
112+
savePath = Paths.get(saveRootPath, info.filename);
113+
initialized = true;
114+
}
115+
116+
public void download() throws IOException {
117+
/*
118+
* Download the index file to the save path. If the file already exists, do
119+
* nothing. If the file does not exist, download the file and check the MD5
120+
* checksum.
121+
*/
122+
if (!initialized) {
123+
throw new IllegalStateException("Handler not initialized!");
124+
}
125+
if (checkIndexFileExist()) {
126+
System.out.println("Index file already exists! Skip downloading.");
127+
return;
128+
}
129+
130+
URL url = new URL(info.urls[0]);
131+
HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection());
132+
long completeFileSize = httpConnection.getContentLengthLong();
133+
134+
try (InputStream inputStream = url.openStream();
135+
CountingInputStream cis = new CountingInputStream(inputStream);
136+
FileOutputStream fileOS = new FileOutputStream(savePath.toFile());
137+
ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) {
138+
139+
pb.setExtraMessage("Downloading...");
140+
141+
new Thread(() -> {
142+
try {
143+
IOUtils.copyLarge(cis, fileOS);
144+
} catch (IOException e) {
145+
e.printStackTrace();
146+
}
147+
}).start();
148+
149+
while (cis.getByteCount() < completeFileSize) {
150+
pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
151+
}
152+
153+
pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
154+
pb.close();
155+
156+
InputStream is = Files.newInputStream(savePath);
157+
if (!checkMD5(is, info.md5)) {
158+
throw new IOException("MD5 check failed!");
159+
}
160+
}
161+
}
162+
163+
public String decompressIndex() throws Exception {
164+
/*
165+
* Decompress the tar.gz or tar index file to an archive folder. If the folder
166+
* already exists, do nothing.
167+
*/
168+
if (!initialized) {
169+
throw new IllegalStateException("Handler not initialized!");
170+
}
171+
if (!checkIndexFileExist()) {
172+
throw new Exception("Index file does not exist!");
173+
}
174+
175+
String indexFolder = savePath.toString().replace(".tar.gz", "");
176+
if (checkFileExist(Paths.get(indexFolder))) {
177+
System.out.println("Index folder already exists!");
178+
return indexFolder;
179+
}
180+
System.out.println("Decompressing index...");
181+
182+
if (checkFileExist(Paths.get(savePath.toString()))) {
183+
ProcessBuilder pbGZIP = new ProcessBuilder("gzip", "-d", savePath.toString());
184+
Process pGZIP = pbGZIP.start();
185+
pGZIP.waitFor();
186+
}
187+
188+
if (checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))) {
189+
ProcessBuilder pbTAR = new ProcessBuilder("tar", "-xvf",
190+
savePath.toString().substring(0, savePath.toString().length() - 3), "-C", saveRootPath);
191+
Process pTar = pbTAR.start();
192+
pTar.waitFor();
193+
194+
// detele the tar file for saving space
195+
Files.delete(Path.of(savePath.toString().replace(".gz", "")));
196+
}
197+
198+
System.out.println("Index decompressed successfully!");
199+
this.indexFolderPath = Paths.get(indexFolder);
200+
return indexFolder;
201+
}
202+
203+
public Path getIndexFolderPath() {
204+
return this.indexFolderPath;
205+
}
206+
}

0 commit comments

Comments
 (0)