Skip to content

Commit

Permalink
downloads wikipedia fvec files for 100k, switched to squad based quer…
Browse files Browse the repository at this point in the history
…y vectors (#130)

* downloads wikipedia fvec files for 100k, switched to squad based query vectors

* update bench readme
  • Loading branch information
phact authored Oct 23, 2023
1 parent 8aa7605 commit b3309aa
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 23 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ target/
.DS_Store

**/.flattened-pom.xml

fvec/
hdf5/
9 changes: 2 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,13 @@ This may not be correct in all setups (e.g. no hyperthreading or hybrid architec
Some sample KNN datasets based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at:

```
aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request
aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request
PRE 100k/
PRE 1M/
PRE 4M/
```

download them with the aws s3 cli as follows:

```
aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request
```

Bench (see below) automatically downloads the 100k dataset to the `./fvec` directory

## Developing and Testing
This project is organized as a [multimodule Maven build](https://maven.apache.org/guides/mini/guide-multiple-modules.html). The intent is to produce a multirelease jar suitable for use as
Expand Down
10 changes: 3 additions & 7 deletions jvector-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,13 @@ You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal po
Some sample KNN datasets based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at:

```
aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request
aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request
PRE 100k/
PRE 1M/
PRE 4M/
```

download them with the aws s3 cli as follows:

```
aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request
```
Bench automatically downloads the 100k dataset to the `./fvec` directory.

To run `SiftSmall`/`Bench` without the JVM vector module available, you can use the following invocations:

Expand Down Expand Up @@ -90,4 +86,4 @@ RESULT [2,1,0]
* Search index for the top-k closest vectors (ordinals of indexed values returned per query)
* `BULKLOAD {localpath}`
* Bulk loads a local file in numpy format Rows x Columns


10 changes: 10 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@
<artifactId>util-mmap</artifactId>
<version>1.0.52-3042601</version>
</dependency>
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>s3-transfer-manager</artifactId>
<version>2.21.2</version>
</dependency>
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>aws-crt-client</artifactId>
<version>2.21.2</version>
</dependency>
<dependency>
<groupId>com.kohlschutter.junixsocket</groupId>
<artifactId>junixsocket-core</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import io.github.jbellis.jvector.disk.CachingGraphIndex;
import io.github.jbellis.jvector.disk.OnDiskGraphIndex;
import io.github.jbellis.jvector.disk.SimpleMappedReader;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.Hdf5Loader;
import io.github.jbellis.jvector.example.util.ReaderSupplierFactory;
Expand All @@ -28,24 +29,37 @@
import io.github.jbellis.jvector.util.Bits;
import io.github.jbellis.jvector.vector.VectorEncoding;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.S3AsyncClientBuilder;
import software.amazon.awssdk.transfer.s3.S3TransferManager;
import software.amazon.awssdk.transfer.s3.model.*;
import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;
import java.util.logging.Logger;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
* Tests GraphIndexes against vectors from various datasets
*/
public class Bench {

// Logger for this class. The original obtained the logger under
// SimpleMappedReader's class name (a copy-paste slip), which mislabels
// every record emitted from Bench; use Bench's own class name.
private static final Logger LOG = Logger.getLogger(Bench.class.getName());
private static void testRecall(int M, int efConstruction, List<Boolean> diskOptions, List<Integer> efSearchOptions, DataSet ds, CompressedVectors cv, Path testDirectory) throws IOException {
var floatVectors = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length);
var topK = ds.groundTruth.get(0).size();
Expand Down Expand Up @@ -142,8 +156,8 @@ public static void main(String[] args) throws IOException {
var diskGrid = List.of(false, true);
var pqGrid = List.of(2, 4, 8);

// this dataset contains more than 10k query vectors, so we limit it with .subList
var adaSet = loadWikipediaData();
maybeDownloadData();
var adaSet = fvecLoadData("wikipedia_squad", "wikipedia_squad/100k");
gridSearch(adaSet, pqGrid, mGrid, efConstructionGrid, diskGrid, efSearchGrid);

var files = List.of(
Expand All @@ -167,17 +181,74 @@ public static void main(String[] args) throws IOException {
}
}

private static DataSet loadWikipediaData() throws IOException {
var baseVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_base_vectors.fvec");
var queryVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_query_vectors_10k.fvec").subList(0, 10_000);
var gt = SiftLoader.readIvecs("fvec/pages_ada_002_100k_indices_query_vectors_10k.ivec").subList(0, 10_000);
var ds = new DataSet("wikipedia",
/**
 * Downloads the 100k wikipedia_squad dataset (base vectors, query vectors,
 * ground-truth indices) from the public astra-vector S3 bucket into ./fvec,
 * skipping any file that already exists locally. Exits the JVM with status 1
 * if a download fails, since Bench cannot proceed without the data.
 */
private static void maybeDownloadData() {
    // S3 object keys for the three files that make up the 100k dataset.
    String[] keys = {
            "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
            "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
            "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"
    };

    String bucketName = "astra-vector";

    // Anonymous credentials: the bucket allows unauthenticated reads
    // (the SDK equivalent of the CLI's --no-sign-request).
    S3AsyncClientBuilder s3ClientBuilder = S3AsyncClient.builder()
            .region(Region.of("us-east-1"))
            .httpClient(AwsCrtAsyncHttpClient.builder()
                    .maxConcurrency(1)
                    .build())
            .credentialsProvider(AnonymousCredentialsProvider.create());

    // Create the distinct local destination directories (parents of the keys) under ./fvec.
    List<String> dirs = Arrays.stream(keys)
            .map(key -> "fvec/" + key.substring(0, key.lastIndexOf('/')))
            .distinct()
            .collect(Collectors.toList());
    for (String dir : dirs) {
        try {
            Files.createDirectories(Paths.get(dir));
        } catch (IOException e) {
            // Include the directory in the message so failures are actionable.
            System.err.println("Failed to create directory " + dir + ": " + e.getMessage());
        }
    }

    // try-with-resources on both the client and the transfer manager so they
    // are closed even when a download throws (the original only closed the
    // manager on the success path, leaking it on error).
    try (S3AsyncClient s3Client = s3ClientBuilder.build();
         S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build()) {
        for (String key : keys) {
            Path path = Paths.get("fvec", key);
            if (Files.exists(path)) {
                continue; // already downloaded on a previous run
            }

            System.out.println("Downloading: " + key);
            DownloadFileRequest downloadFileRequest =
                    DownloadFileRequest.builder()
                            .getObjectRequest(b -> b.bucket(bucketName).key(key))
                            .addTransferListener(LoggingTransferListener.create())
                            .destination(path)
                            .build();

            FileDownload downloadFile = tm.downloadFile(downloadFileRequest);

            // Block until this file finishes before starting the next one.
            CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
            System.out.println("Downloaded file of length " + downloadResult.response().contentLength());
        }
    }
    catch (Exception e) {
        // Without the dataset Bench cannot run, so fail fast.
        System.err.println("Error downloading data from S3: " + e.getMessage());
        System.exit(1);
    }
}

/**
 * Loads a 100k-scale fvec/ivec dataset from ./fvec/{path}/ and wraps it in a
 * DataSet using dot-product similarity (the ada-002 embeddings are normalized).
 *
 * @param name label for the dataset, used in the DataSet and in log output
 * @param path directory under ./fvec containing the three dataset files
 * @return the loaded dataset (base vectors, query vectors, ground truth)
 * @throws IOException if any of the three files cannot be read
 */
private static DataSet fvecLoadData(String name, String path) throws IOException {
    // File names are fixed for the 100k dataset; `path` only selects the directory.
    var baseVectors = SiftLoader.readFvecs("fvec/" + path + "/ada_002_100000_base_vectors.fvec");
    var queryVectors = SiftLoader.readFvecs("fvec/" + path + "/ada_002_100000_query_vectors_10000.fvec");
    var gt = SiftLoader.readIvecs("fvec/" + path + "/ada_002_100000_indices_query_10000.ivec");
    var ds = new DataSet(name,
                         VectorSimilarityFunction.DOT_PRODUCT,
                         baseVectors,
                         queryVectors,
                         gt);
    System.out.format("%n%s: %d base and %d query vectors loaded, dimensions %d%n",
                      name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length);
    return ds;
}

Expand Down

0 comments on commit b3309aa

Please sign in to comment.