From 43d0e358ea601511008a6cc23a12f4714b4f5ceb Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Fri, 10 Nov 2023 10:40:00 -0600 Subject: [PATCH] add e5-v2-base, e5-v2-large, and gecko datasets --- .../github/jbellis/jvector/example/Bench.java | 55 +++++++------------ .../jvector/example/util/DownloadHelper.java | 15 +++++ 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index efa6761d6..4e0435cd9 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -204,21 +204,23 @@ public static void main(String[] args) throws IOException { // compile regex and do substring matching using find var pattern = Pattern.compile(regex); - if (pattern.matcher("wikipedia_squad/100k/e5-small-v2").find()) { - DownloadHelper.maybeDownloadFvecs("intfloat_e5-small-v2_100000"); - var e5set = loadE5SmallData("wikipedia_squad/100k"); - gridSearch(e5set, compressionGrid, mGrid, efConstructionGrid, efSearchGrid); - cachedCompressors.clear(); - } - - if (pattern.matcher("wikipedia_squad/100k/ada_002").find()) { - DownloadHelper.maybeDownloadFvecs("ada_002_100000"); - var adaSet = loadWikipediaData("wikipedia_squad/100k"); - gridSearch(adaSet, compressionGrid, mGrid, efConstructionGrid, efSearchGrid); - cachedCompressors.clear(); + // large embeddings calculated by Neighborhood Watch. 100k files by default; 1M also available + var nwFiles = List.of( + "intfloat_e5-small-v2_100000", + "intfloat_e5-base-v2_100000", + "intfloat_e5-large-v2_100000", + "textembedding-gecko_100000", + "ada_002_100000"); + for (var nwDatasetName : nwFiles) { + if (pattern.matcher(nwDatasetName).find()) { + DownloadHelper.maybeDownloadFvecs(nwDatasetName); + gridSearch(loadNWDataData(nwDatasetName), compressionGrid, mGrid, efConstructionGrid, efSearchGrid); + cachedCompressors.clear(); + } } - var files = List.of( + // smaller vectors from ann-benchmarks + var hdf5Files = List.of( // large files not yet supported // "hdf5/deep-image-96-angular.hdf5", // "hdf5/gist-960-euclidean.hdf5", @@ -229,7 +231,7 @@ public static void main(String[] args) throws IOException { "glove-200-angular.hdf5", "nytimes-256-angular.hdf5", "sift-128-euclidean.hdf5"); - for (var f : files) { + for (var f : hdf5Files) { if (pattern.matcher(f).find()) { DownloadHelper.maybeDownloadHdf5(f); gridSearch(Hdf5Loader.load(f), compressionGrid, mGrid, efConstructionGrid, efSearchGrid); @@ -238,26 +240,11 @@ public static void main(String[] args) throws IOException { } } - private static DataSet loadE5SmallData(String path) throws IOException { - var baseVectors = SiftLoader.readFvecs("fvec/" + path + "/intfloat_e5-small-v2_100000_base_vectors.fvec"); - var queryVectors = SiftLoader.readFvecs("fvec/" + path + "/intfloat_e5-small-v2_100000_query_vectors_10000.fvec"); - var gt = SiftLoader.readIvecs("fvec/" + path + "/intfloat_e5-small-v2_100000_indices_query_10000.ivec"); - String name = Path.of(path).getName(0).toString(); - var ds = new DataSet(name, - VectorSimilarityFunction.DOT_PRODUCT, - baseVectors, - queryVectors, - gt); - System.out.format("%n%s: %d base and %d query vectors loaded, dimensions %d%n", - name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length); - return ds; - } - - private static DataSet loadWikipediaData(String path) throws IOException { - var baseVectors = SiftLoader.readFvecs("fvec/" + path + "/ada_002_100000_base_vectors.fvec"); - var queryVectors = SiftLoader.readFvecs("fvec/" + path + "/ada_002_100000_query_vectors_10000.fvec"); - var gt = SiftLoader.readIvecs("fvec/" + path + "/ada_002_100000_indices_query_10000.ivec"); - String name = Path.of(path).getName(0).toString(); + private static DataSet loadNWDataData(String name) throws IOException { + var path = "wikipedia_squad/100k"; + var baseVectors = SiftLoader.readFvecs("fvec/" + path + "/" + name + "_base_vectors.fvec"); + var queryVectors = SiftLoader.readFvecs("fvec/" + path + "/" + name + "_query_vectors_10000.fvec"); + var gt = SiftLoader.readIvecs("fvec/" + path + "/" + name + "_indices_query_10000.ivec"); var ds = new DataSet(name, VectorSimilarityFunction.DOT_PRODUCT, baseVectors, diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java index 7bdb7d593..3f54d6d74 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java @@ -64,6 +64,21 @@ public static void maybeDownloadFvecs(String prefix) { "wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec", "wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec")); break; + case "intfloat_e5-base-v2_100000": + keys.addAll(List.of("wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec", + "wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec")); + break; + case "intfloat_e5-large-v2_100000": + keys.addAll(List.of("wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec", + "wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec")); + break; + case "textembedding-gecko_100000": + keys.addAll(List.of("wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec", + "wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec")); + break; default: throw new IllegalArgumentException("Unknown prefix: " + prefix); }