From b3309aad7fa3fd17ff1665c1b467590c3df2a52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20Est=C3=A9vez?= Date: Mon, 23 Oct 2023 12:47:43 -0400 Subject: [PATCH] downloads wikipedia fvec files for 100k, switched to squad based query vectors (#130) * downloads wikipedia fvec files for 100k, switched to squad based query vectors * update bench readme --- .gitignore | 3 + README.md | 9 +- jvector-examples/README.md | 10 +-- jvector-examples/pom.xml | 10 +++ .../github/jbellis/jvector/example/Bench.java | 89 +++++++++++++++++-- 5 files changed, 98 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index b628049ca..8d91b6762 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ target/ .DS_Store **/.flattened-pom.xml + +fvec/ +hdf5/ diff --git a/README.md b/README.md index 5b18bb29c..2f09c1b98 100644 --- a/README.md +++ b/README.md @@ -100,18 +100,13 @@ This may not be correct in all setups (e.g. no hyperthreading or hybrid architec Some sample KNN datasets for testing based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at: ``` -aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request +aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request PRE 100k/ PRE 1M/ PRE 4M/ ``` -download them with the aws s3 cli as follows: - -``` -aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request -``` - +Bench (see below) automatically downloads the 100k dataset to the `./fvec` directory ## Developing and Testing This project is organized as a [multimodule Maven build](https://maven.apache.org/guides/mini/guide-multiple-modules.html).
The intent is to produce a multirelease jar suitable for use as diff --git a/jvector-examples/README.md b/jvector-examples/README.md index 4ce338c95..5cca4fa4e 100644 --- a/jvector-examples/README.md +++ b/jvector-examples/README.md @@ -21,17 +21,13 @@ You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal po Some sample KNN datasets for testing based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at: ``` -aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request +aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request PRE 100k/ PRE 1M/ PRE 4M/ ``` -download them with the aws s3 cli as follows: - -``` -aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request -``` +Bench automatically downloads the 100k dataset to the `./fvec` directory . To run `SiftSmall`/`Bench` without the JVM vector module available, you can use the following invocations: @@ -90,4 +86,4 @@ RESULT [2,1,0] * Search index for the top-k closest vectors (ordinals of indexed values returned per query) * `BULKLOAD {localpath}` * Bulk loads a local file in numpy format Rows x Columns - \ No newline at end of file + diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index bfd73462a..ce824aeff 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -39,6 +39,16 @@ util-mmap 1.0.52-3042601 + + software.amazon.awssdk + s3-transfer-manager + 2.21.2 + + + software.amazon.awssdk + aws-crt-client + 2.21.2 + com.kohlschutter.junixsocket junixsocket-core diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index a9e7b74db..e05519154 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -18,6 +18,7 @@ import io.github.jbellis.jvector.disk.CachingGraphIndex; import 
io.github.jbellis.jvector.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.disk.SimpleMappedReader; import io.github.jbellis.jvector.example.util.DataSet; import io.github.jbellis.jvector.example.util.Hdf5Loader; import io.github.jbellis.jvector.example.util.ReaderSupplierFactory; @@ -28,12 +29,22 @@ import io.github.jbellis.jvector.util.Bits; import io.github.jbellis.jvector.vector.VectorEncoding; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider; +import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; +import software.amazon.awssdk.transfer.s3.S3TransferManager; +import software.amazon.awssdk.transfer.s3.model.*; +import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; +import java.util.logging.Logger; import java.io.BufferedOutputStream; import java.io.DataOutputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Arrays; import java.util.List; import java.util.Set; @@ -41,11 +52,14 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; +import java.util.stream.Stream; /** * Tests GraphIndexes against vectors from various datasets */ public class Bench { + + private static final Logger LOG = Logger.getLogger(SimpleMappedReader.class.getName()); private static void testRecall(int M, int efConstruction, List diskOptions, List efSearchOptions, DataSet ds, CompressedVectors cv, Path testDirectory) throws IOException { var floatVectors = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length); var topK = ds.groundTruth.get(0).size(); @@ -142,8 +156,8 @@ public static void main(String[] args) throws IOException { var 
diskGrid = List.of(false, true);
         var pqGrid = List.of(2, 4, 8);
 
-        // this dataset contains more than 10k query vectors, so we limit it with .subList
-        var adaSet = loadWikipediaData();
+        maybeDownloadData();
+        var adaSet = fvecLoadData("wikipedia_squad", "wikipedia_squad/100k");
         gridSearch(adaSet, pqGrid, mGrid, efConstructionGrid, diskGrid, efSearchGrid);
 
         var files = List.of(
@@ -167,17 +181,74 @@ public static void main(String[] args) throws IOException {
         }
     }
 
-    private static DataSet loadWikipediaData() throws IOException {
-        var baseVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_base_vectors.fvec");
-        var queryVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_query_vectors_10k.fvec").subList(0, 10_000);
-        var gt = SiftLoader.readIvecs("fvec/pages_ada_002_100k_indices_query_vectors_10k.ivec").subList(0, 10_000);
-        var ds = new DataSet("wikipedia",
+    /**
+     * Downloads the wikipedia_squad 100k base, query and ground truth files from the
+     * astra-vector S3 bucket into the local fvec/ directory, skipping any file that
+     * already exists. Requests are made with anonymous credentials.
+     * Prints the error to stderr and exits with status 1 if anything fails.
+     */
+    private static void maybeDownloadData() {
+        String[] keys = {
+                "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
+                "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
+                "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"
+        };
+
+        String bucketName = "astra-vector";
+
+        S3AsyncClientBuilder s3ClientBuilder = S3AsyncClient.builder()
+                .region(Region.of("us-east-1"))
+                .httpClient(AwsCrtAsyncHttpClient.builder()
+                        .maxConcurrency(1)
+                        .build())
+                .credentialsProvider(AnonymousCredentialsProvider.create());
+
+        // both clients are try-with-resources so they are closed even if a download fails
+        try (S3AsyncClient s3Client = s3ClientBuilder.build();
+             S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build()) {
+            for (String key : keys) {
+                Path path = Paths.get("fvec", key);
+                if (Files.exists(path)) {
+                    continue;
+                }
+
+                // a failure to create the target directory is fatal, same as a failed download
+                Files.createDirectories(path.getParent());
+
+                System.out.println("Downloading: " + key);
+                DownloadFileRequest downloadFileRequest =
+                        DownloadFileRequest.builder()
+                                .getObjectRequest(b -> b.bucket(bucketName).key(key))
+                                .addTransferListener(LoggingTransferListener.create())
+                                .destination(path)
+                                .build();
+
+                FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
+
+                CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
+                System.out.println("Downloaded file of length " + downloadResult.response().contentLength());
+            }
+        } catch (Exception e) {
+            System.err.println("Error downloading data from S3: " + e.getMessage());
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Loads the base, query and ground truth vectors stored under fvec/{path} into a
+     * DOT_PRODUCT DataSet with the given name and prints a summary of what was loaded.
+     */
+    private static DataSet fvecLoadData(String name, String path) throws IOException {
+        var baseVectors = SiftLoader.readFvecs("fvec/"+path+"/ada_002_100000_base_vectors.fvec");
+        var queryVectors = SiftLoader.readFvecs("fvec/"+path+"/ada_002_100000_query_vectors_10000.fvec");
+        var gt = SiftLoader.readIvecs("fvec/"+path+"/ada_002_100000_indices_query_10000.ivec");
+        var ds = new DataSet(name,
                 VectorSimilarityFunction.DOT_PRODUCT,
                 baseVectors,
                 queryVectors,
                 gt);
-        System.out.format("%nWikipedia: %d base and %d query vectors loaded, dimensions %d%n",
-                baseVectors.size(), queryVectors.size(), baseVectors.get(0).length);
+        System.out.format("%n%s: %d base and %d query vectors loaded, dimensions %d%n",
+                name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length);
         return ds;
     }
 