From 8e98e50b6768a60b642640180e2c89dd7873dfea Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Mon, 6 Jan 2025 17:50:43 -0600 Subject: [PATCH] add MADV_RANDOM (#382) * add MADV_RANDOM * promote ReaderSupplierFactory to jvector-base * comments --- README.md | 4 ++-- .../jvector/disk/RandomAccessReader.java | 8 +++++++ .../jbellis/jvector/disk/ReaderSupplier.java | 3 +++ .../jvector/disk}/ReaderSupplierFactory.java | 18 +++++++++------ .../github/jbellis/jvector/example/Grid.java | 2 +- .../jbellis/jvector/example/IPCService.java | 5 +--- .../jbellis/jvector/example/SiftSmall.java | 10 ++++---- .../jvector/disk/MemorySegmentReader.java | 23 +++++++++++++++++-- 8 files changed, 52 insertions(+), 21 deletions(-) rename {jvector-examples/src/main/java/io/github/jbellis/jvector/example/util => jvector-base/src/main/java/io/github/jbellis/jvector/disk}/ReaderSupplierFactory.java (70%) diff --git a/README.md b/README.md index f5ed9e5c5..e77f7bb8f 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The code: // on-disk indexes require a ReaderSupplier (not just a Reader) because we will want it to // open additional readers for searching - ReaderSupplier rs = new SimpleMappedReaderSupplier(indexPath); + ReaderSupplier rs = ReaderSupplierFactory.open(indexPath); OnDiskGraphIndex index = OnDiskGraphIndex.load(rs); // measure our recall against the (exactly computed) ground truth Function, SearchScoreProvider> sspFactory = q -> SearchScoreProvider.exact(q, VectorSimilarityFunction.EUCLIDEAN, ravv); @@ -174,7 +174,7 @@ Compressing the vectors with product quantization is done as follows: Then we can wire up the compressed vectors to a two-phase search by getting the fast ApproximateScoreFunction from PQVectors, and the Reranker from the index View: ```java - ReaderSupplier rs = new MMapReaderSupplier(indexPath); + ReaderSupplier rs = ReaderSupplierFactory.open(indexPath); OnDiskGraphIndex index = OnDiskGraphIndex.load(rs); // load the PQVectors that we just wrote to
disk try (RandomAccessReader in = new SimpleMappedReader(pqPath)) { diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessReader.java b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessReader.java index 8fefca210..f3bea3c82 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessReader.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessReader.java @@ -22,6 +22,14 @@ /** * This is a subset of DataInput, plus seek and readFully methods, which allows implementations * to use more efficient options like FloatBuffer for bulk reads. + *

+ * JVector includes production-ready implementations; the recommended way to use these is via + * `ReaderSupplierFactory.open`. For custom implementations, e.g. reading from network storage, + * you should also implement a corresponding `ReaderSupplier`. + *

+ * The general usage pattern is expected to be "seek to a position, then read sequentially from there." + * Thus, RandomAccessReader implementations are expected to be stateful and NOT threadsafe; JVector + * uses the ReaderSupplier API to create a RandomAccessReader per thread, as needed. */ public interface RandomAccessReader extends AutoCloseable { void seek(long offset) throws IOException; diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplier.java b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplier.java index f543e1716..8f8d2ae2c 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplier.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplier.java @@ -18,6 +18,9 @@ import java.io.IOException; +/** + * A supplier of RandomAccessReaders. + */ public interface ReaderSupplier extends AutoCloseable { /** * @return a new reader. It is up to the caller to re-use these readers or close them, diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/ReaderSupplierFactory.java b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplierFactory.java similarity index 70% rename from jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/ReaderSupplierFactory.java rename to jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplierFactory.java index f57b3e8a4..92e3dbda6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/ReaderSupplierFactory.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplierFactory.java @@ -13,10 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.github.jbellis.jvector.example.util; - -import io.github.jbellis.jvector.disk.ReaderSupplier; -import io.github.jbellis.jvector.disk.SimpleMappedReader; +package io.github.jbellis.jvector.disk; import java.io.IOException; import java.lang.reflect.Constructor; @@ -28,9 +25,11 @@ public class ReaderSupplierFactory { private static final Logger LOG = Logger.getLogger(ReaderSupplierFactory.class.getName()); private static final String MEMORY_SEGMENT_READER_CLASSNAME = "io.github.jbellis.jvector.disk.MemorySegmentReader$Supplier"; + private static final String MMAP_READER_CLASSNAME = "io.github.jbellis.jvector.example.util.MMapReader$Supplier"; public static ReaderSupplier open(Path path) throws IOException { try { + // prefer MemorySegmentReader (available under JDK 20+) var supplierClass = Class.forName(MEMORY_SEGMENT_READER_CLASSNAME); Constructor ctor = supplierClass.getConstructor(Path.class); return (ReaderSupplier) ctor.newInstance(path); @@ -40,14 +39,19 @@ public static ReaderSupplier open(Path path) throws IOException { } try { - return new MMapReader.Supplier(path); - } catch (UnsatisfiedLinkError|NoClassDefFoundError e) { + // fall back to MMapReader (requires a 3rd party linux-only native mmap library that is only included + // in the build with jvector-example; this allows Bench to not embarrass us on older JDKs) + var supplierClass = Class.forName(MMAP_READER_CLASSNAME); + Constructor ctor = supplierClass.getConstructor(Path.class); + return (ReaderSupplier) ctor.newInstance(path); + } catch (Exception e) { LOG.log(Level.WARNING, "MMapReaderSupplier not available, falling back to SimpleMappedReaderSupplier. 
More details available at level FINE."); LOG.log(Level.FINE, "MMapReaderSupplier instantiation exception:", e); if (Files.size(path) > Integer.MAX_VALUE) { - throw new RuntimeException("File sizes greater than 2GB are not supported on Windows--contributions welcome"); + throw new RuntimeException("File sizes greater than 2GB are not supported on older Windows JDKs"); } + // finally, fall back to SimpleMappedReader (available everywhere, but doesn't support files > 2GB) return new SimpleMappedReader.Supplier(path); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index 4ebbc2280..0ea8d353f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -16,9 +16,9 @@ package io.github.jbellis.jvector.example; +import io.github.jbellis.jvector.disk.ReaderSupplierFactory; import io.github.jbellis.jvector.example.util.CompressorParameters; import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.ReaderSupplierFactory; import io.github.jbellis.jvector.graph.GraphIndex; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.GraphSearcher; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/IPCService.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/IPCService.java index 9e89db81d..ce763bebf 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/IPCService.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/IPCService.java @@ -16,8 +16,8 @@ package io.github.jbellis.jvector.example; +import io.github.jbellis.jvector.disk.ReaderSupplierFactory; import io.github.jbellis.jvector.example.util.MMapRandomAccessVectorValues; -import 
io.github.jbellis.jvector.example.util.ReaderSupplierFactory; import io.github.jbellis.jvector.example.util.UpdatableRandomAccessVectorValues; import io.github.jbellis.jvector.graph.GraphIndex; import io.github.jbellis.jvector.graph.GraphIndexBuilder; @@ -30,13 +30,10 @@ import io.github.jbellis.jvector.graph.similarity.ScoreFunction; import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; import io.github.jbellis.jvector.quantization.CompressedVectors; -import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; import io.github.jbellis.jvector.util.Bits; -import io.github.jbellis.jvector.util.PhysicalCoreExecutor; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; -import io.github.jbellis.jvector.vector.types.ByteSequence; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; import org.newsclub.net.unix.AFUNIXServerSocket; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/SiftSmall.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/SiftSmall.java index 334ef1f1d..444d3fae0 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/SiftSmall.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/SiftSmall.java @@ -18,8 +18,8 @@ import io.github.jbellis.jvector.disk.RandomAccessReader; import io.github.jbellis.jvector.disk.ReaderSupplier; +import io.github.jbellis.jvector.disk.ReaderSupplierFactory; import io.github.jbellis.jvector.disk.SimpleMappedReader; -import io.github.jbellis.jvector.example.util.MMapReader; import io.github.jbellis.jvector.example.util.SiftLoader; import io.github.jbellis.jvector.graph.GraphIndex; import io.github.jbellis.jvector.graph.GraphIndexBuilder; @@ -158,7 +158,7 @@ public static void siftPersisted(List> baseVectors, 
List, SearchScoreProvider> sspFactory = q -> SearchScoreProvider.exact(q, VectorSimilarityFunction.EUCLIDEAN, ravv); @@ -190,7 +190,7 @@ public static void siftDiskAnn(List> baseVectors, List> baseVectors, List> baseVectors, List< } // searching the index does not change - ReaderSupplier rs = new MMapReader.Supplier(indexPath); + ReaderSupplier rs = ReaderSupplierFactory.open(indexPath); OnDiskGraphIndex index = OnDiskGraphIndex.load(rs); try (RandomAccessReader in = new SimpleMappedReader(pqPath)) { var pqvSearch = PQVectors.load(in); diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/disk/MemorySegmentReader.java b/jvector-native/src/main/java/io/github/jbellis/jvector/disk/MemorySegmentReader.java index 39a6237f6..062769b75 100644 --- a/jvector-native/src/main/java/io/github/jbellis/jvector/disk/MemorySegmentReader.java +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/disk/MemorySegmentReader.java @@ -18,6 +18,8 @@ import java.io.IOException; import java.lang.foreign.Arena; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; import java.lang.foreign.MemorySegment; import java.lang.foreign.ValueLayout; import java.lang.foreign.ValueLayout.OfFloat; @@ -31,11 +33,15 @@ import java.nio.file.StandardOpenOption; /** - * {@link MemorySegment} based implementation of RandomAccessReader. - * MemorySegmentReader doesn't have 2GB file size limitation of {@link SimpleMappedReader}. + * {@link MemorySegment} based implementation of RandomAccessReader. This is the recommended + * RandomAccessReader implementation included with JVector. + *

+ * MemorySegmentReader applies MADV_RANDOM to the backing storage, and doesn't have the 2GB file size limitation + * of {@link SimpleMappedReader}. */ public class MemorySegmentReader implements RandomAccessReader { + private static final int MADV_RANDOM = 1; // Value for Linux private static final OfInt intLayout = ValueLayout.JAVA_INT_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN); private static final OfFloat floatLayout = ValueLayout.JAVA_FLOAT_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN); private static final OfLong longLayout = ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN); @@ -48,6 +54,19 @@ public MemorySegmentReader(Path path) throws IOException { arena = Arena.ofShared(); try (var ch = FileChannel.open(path, StandardOpenOption.READ)) { memory = ch.map(MapMode.READ_ONLY, 0L, ch.size(), arena); + + // Apply MADV_RANDOM advice + var linker = Linker.nativeLinker(); + var madvise = linker.downcallHandle(linker.defaultLookup().find("posix_madvise").get(), + FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.JAVA_LONG, ValueLayout.JAVA_INT)); + try { + int result = (int) madvise.invokeExact(memory, memory.byteSize(), MADV_RANDOM); + if (result != 0) { + throw new IOException("posix_madvise failed with error code: " + result); + } + } catch (Throwable t) { + throw new RuntimeException(t); + } } catch (Exception e) { arena.close(); throw e;