From b3309aad7fa3fd17ff1665c1b467590c3df2a52d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20Est=C3=A9vez?= <estevezsebastian@gmail.com>
Date: Mon, 23 Oct 2023 12:47:43 -0400
Subject: [PATCH] downloads wikipedia fvec files for 100k, switched to squad
 based query vectors (#130)

* downloads wikipedia fvec files for 100k, switched to squad based query vectors

* update bench readme
---
 .gitignore                                    |  3 +
 README.md                                     |  9 +-
 jvector-examples/README.md                    | 10 +--
 jvector-examples/pom.xml                      | 10 +++
 .../github/jbellis/jvector/example/Bench.java | 89 +++++++++++++++++--
 5 files changed, 98 insertions(+), 23 deletions(-)
diff --git a/.gitignore b/.gitignore
index b628049ca..8d91b6762 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ target/
 .DS_Store
 
 **/.flattened-pom.xml
+
+fvec/
+hdf5/
diff --git a/README.md b/README.md
index 5b18bb29c..2f09c1b98 100644
--- a/README.md
+++ b/README.md
@@ -100,18 +100,13 @@ This may not be correct in all setups (e.g. no hyperthreading or hybrid architec
 Some sample KNN datasets for testing based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at:
 
 ```
-aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request 
+aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request 
                            PRE 100k/
                            PRE 1M/
                            PRE 4M/
 ```
 
-download them with the aws s3 cli as follows:
-
-```
-aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request
-```
-
+Bench (see below) automatically downloads the 100k dataset to the `./fvec` directory.
 
 ## Developing and Testing
 This project is organized as a [multimodule Maven build](https://maven.apache.org/guides/mini/guide-multiple-modules.html). The intent is to produce a multirelease jar suitable for use as
diff --git a/jvector-examples/README.md b/jvector-examples/README.md
index 4ce338c95..5cca4fa4e 100644
--- a/jvector-examples/README.md
+++ b/jvector-examples/README.md
@@ -21,17 +21,13 @@ You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal po
 Some sample KNN datasets for testing based on ada-002 embeddings generated on wikipedia data are available in ivec/fvec format for testing at:
 
 ```
-aws s3 ls s3://astra-vector/wikipedia/ --no-sign-request 
+aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request 
                            PRE 100k/
                            PRE 1M/
                            PRE 4M/
 ```
 
-download them with the aws s3 cli as follows:
-
-```
-aws s3 sync s3://astra-vector/wikipedia/100k ./ --no-sign-request
-```
+Bench automatically downloads the 100k dataset to the `./fvec` directory.
 
 To run `SiftSmall`/`Bench` without the JVM vector module available, you can use the following invocations:
 
@@ -90,4 +86,4 @@ RESULT [2,1,0]
     * Search index for the top-k closest vectors (ordinals of indexed values returned per query)
   * `BULKLOAD {localpath}`
     * Bulk loads a local file in numpy format Rows x Columns
-    
\ No newline at end of file
+    
diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml
index bfd73462a..ce824aeff 100644
--- a/jvector-examples/pom.xml
+++ b/jvector-examples/pom.xml
@@ -39,6 +39,16 @@
             <artifactId>util-mmap</artifactId>
             <version>1.0.52-3042601</version>
         </dependency>
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>s3-transfer-manager</artifactId>
+            <version>2.21.2</version>
+        </dependency>
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>aws-crt-client</artifactId>
+            <version>2.21.2</version>
+        </dependency>
         <dependency>
             <groupId>com.kohlschutter.junixsocket</groupId>
             <artifactId>junixsocket-core</artifactId>
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
index a9e7b74db..e05519154 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
@@ -18,6 +18,7 @@
 
 import io.github.jbellis.jvector.disk.CachingGraphIndex;
 import io.github.jbellis.jvector.disk.OnDiskGraphIndex;
+import io.github.jbellis.jvector.disk.SimpleMappedReader;
 import io.github.jbellis.jvector.example.util.DataSet;
 import io.github.jbellis.jvector.example.util.Hdf5Loader;
 import io.github.jbellis.jvector.example.util.ReaderSupplierFactory;
@@ -28,12 +29,22 @@
 import io.github.jbellis.jvector.util.Bits;
 import io.github.jbellis.jvector.vector.VectorEncoding;
 import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
+import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3AsyncClient;
+import software.amazon.awssdk.services.s3.S3AsyncClientBuilder;
+import software.amazon.awssdk.transfer.s3.S3TransferManager;
+import software.amazon.awssdk.transfer.s3.model.*;
+import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;
+import java.util.logging.Logger;
 
 import java.io.BufferedOutputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
@@ -41,11 +52,14 @@
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
 
 /**
  * Tests GraphIndexes against vectors from various datasets
  */
 public class Bench {
+
+    private static final Logger LOG = Logger.getLogger(Bench.class.getName());
     private static void testRecall(int M, int efConstruction, List<Boolean> diskOptions, List<Integer> efSearchOptions, DataSet ds, CompressedVectors cv, Path testDirectory) throws IOException {
         var floatVectors = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length);
         var topK = ds.groundTruth.get(0).size();
@@ -142,8 +156,8 @@ public static void main(String[] args) throws IOException {
         var diskGrid = List.of(false, true);
         var pqGrid = List.of(2, 4, 8);
 
-        // this dataset contains more than 10k query vectors, so we limit it with .subList
-        var adaSet = loadWikipediaData();
+        maybeDownloadData();
+        var adaSet = fvecLoadData("wikipedia_squad", "wikipedia_squad/100k");
         gridSearch(adaSet, pqGrid, mGrid, efConstructionGrid, diskGrid, efSearchGrid);
 
         var files = List.of(
@@ -167,17 +181,74 @@ public static void main(String[] args) throws IOException {
         }
     }
 
-    private static DataSet loadWikipediaData() throws IOException {
-        var baseVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_base_vectors.fvec");
-        var queryVectors = SiftLoader.readFvecs("fvec/pages_ada_002_100k_query_vectors_10k.fvec").subList(0, 10_000);
-        var gt = SiftLoader.readIvecs("fvec/pages_ada_002_100k_indices_query_vectors_10k.ivec").subList(0, 10_000);
-        var ds = new DataSet("wikipedia",
+    /**
+     * Ensures the 100k wikipedia_squad vectors are present under {@code ./fvec}, downloading any
+     * missing file from the {@code astra-vector} S3 bucket using anonymous credentials.
+     * Files that already exist locally are skipped. Any failure (directory creation or
+     * transfer) is reported on stderr and terminates the JVM with exit status 1.
+     */
+    private static void maybeDownloadData() {
+        String[] keys = {
+                "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
+                "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
+                "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"
+        };
+
+        String bucketName = "astra-vector";
+
+        S3AsyncClientBuilder s3ClientBuilder = S3AsyncClient.builder()
+                .region(Region.of("us-east-1"))
+                .httpClient(AwsCrtAsyncHttpClient.builder()
+                        .maxConcurrency(1)
+                        .build())
+                .credentialsProvider(AnonymousCredentialsProvider.create());
+
+        // try-with-resources closes the transfer manager and then the client, even on failure
+        try (S3AsyncClient s3Client = s3ClientBuilder.build();
+             S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build()) {
+            for (String key : keys) {
+                // the local layout mirrors the S3 key: fvec/<key>
+                Path path = Paths.get("fvec", key);
+                if (Files.exists(path)) {
+                    continue;
+                }
+
+                // an IOException here is handled below instead of being logged and ignored
+                Files.createDirectories(path.getParent());
+
+                System.out.println("Downloading: "+key);
+                DownloadFileRequest downloadFileRequest =
+                        DownloadFileRequest.builder()
+                                .getObjectRequest(b -> b.bucket(bucketName).key(key))
+                                .addTransferListener(LoggingTransferListener.create())
+                                .destination(path)
+                                .build();
+
+                FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
+
+                // block until this file is complete; join() throws if the transfer failed
+                CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
+                System.out.println("Downloaded file of length " + downloadResult.response().contentLength());
+            }
+        }
+        catch(Exception e){
+            // e.toString() keeps the exception type; getMessage() alone may be null
+            System.err.println("Error downloading data from S3: " + e);
+            System.exit(1);
+        }
+    }
+
+    private static DataSet fvecLoadData(String name, String path) throws IOException {
+        var baseVectors = SiftLoader.readFvecs("fvec/"+path+"/ada_002_100000_base_vectors.fvec");
+        var queryVectors = SiftLoader.readFvecs("fvec/"+path+"/ada_002_100000_query_vectors_10000.fvec");
+        var gt = SiftLoader.readIvecs("fvec/"+path+"/ada_002_100000_indices_query_10000.ivec");
+        var ds = new DataSet(name,
                              VectorSimilarityFunction.DOT_PRODUCT,
                              baseVectors,
                              queryVectors,
                              gt);
-        System.out.format("%nWikipedia: %d base and %d query vectors loaded, dimensions %d%n",
-                          baseVectors.size(), queryVectors.size(), baseVectors.get(0).length);
+        System.out.format("%n%s: %d base and %d query vectors loaded, dimensions %d%n",
+                          name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length);
         return ds;
     }