Skip to content

Commit

Permalink
add MADV_RANDOM (#382)
Browse files Browse the repository at this point in the history
* add MADV_RANDOM

* promote ReaderSupplierFactory to jvector-base

* comments
  • Loading branch information
jbellis authored Jan 6, 2025
1 parent 5b0df95 commit 8e98e50
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 21 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ The code:

// on-disk indexes require a ReaderSupplier (not just a Reader) because we will want it to
// open additional readers for searching
ReaderSupplier rs = new SimpleMappedReaderSupplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
// measure our recall against the (exactly computed) ground truth
Function<VectorFloat<?>, SearchScoreProvider> sspFactory = q -> SearchScoreProvider.exact(q, VectorSimilarityFunction.EUCLIDEAN, ravv);
Expand Down Expand Up @@ -174,7 +174,7 @@ Compressing the vectors with product quantization is done as follows:

Then we can wire up the compressed vectors to a two-phase search by getting the fast ApproximateScoreFunction from PQVectors, and the Reranker from the index View:
```java
ReaderSupplier rs = new MMapReaderSupplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
// load the PQVectors that we just wrote to disk
try (RandomAccessReader in = new SimpleMappedReader(pqPath)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@
/**
* This is a subset of DataInput, plus seek and readFully methods, which allows implementations
* to use more efficient options like FloatBuffer for bulk reads.
* <p>
* JVector includes production-ready implementations; the recommended way to use these is via
* `ReaderSupplierFactory.open`. For custom implementations, e.g. reading from network storage,
* you should also implement a corresponding `ReaderSupplier`.
* <p>
* The general usage pattern is expected to be "seek to a position, then read sequentially from there."
* Thus, RandomAccessReader implementations are expected to be stateful and NOT threadsafe; JVector
* uses the ReaderSupplier API to create a RandomAccessReader per thread, as needed.
*/
public interface RandomAccessReader extends AutoCloseable {
void seek(long offset) throws IOException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

import java.io.IOException;

/**
* A supplier of RandomAccessReaders.
*/
public interface ReaderSupplier extends AutoCloseable {
/**
* @return a new reader. It is up to the caller to re-use these readers or close them,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.github.jbellis.jvector.example.util;

import io.github.jbellis.jvector.disk.ReaderSupplier;
import io.github.jbellis.jvector.disk.SimpleMappedReader;
package io.github.jbellis.jvector.disk;

import java.io.IOException;
import java.lang.reflect.Constructor;
Expand All @@ -28,9 +25,11 @@
public class ReaderSupplierFactory {
private static final Logger LOG = Logger.getLogger(ReaderSupplierFactory.class.getName());
private static final String MEMORY_SEGMENT_READER_CLASSNAME = "io.github.jbellis.jvector.disk.MemorySegmentReader$Supplier";
private static final String MMAP_READER_CLASSNAME = "io.github.jbellis.jvector.example.util.MMapReader$Supplier";

public static ReaderSupplier open(Path path) throws IOException {
try {
// prefer MemorySegmentReader (available under JDK 20+)
var supplierClass = Class.forName(MEMORY_SEGMENT_READER_CLASSNAME);
Constructor<?> ctor = supplierClass.getConstructor(Path.class);
return (ReaderSupplier) ctor.newInstance(path);
Expand All @@ -40,14 +39,19 @@ public static ReaderSupplier open(Path path) throws IOException {
}

try {
return new MMapReader.Supplier(path);
} catch (UnsatisfiedLinkError|NoClassDefFoundError e) {
// fall back to MMapReader (requires a 3rd party linux-only native mmap library that is only included
// in the build with jvector-example; this allows Bench to not embarrass us on older JDKs)
var supplierClass = Class.forName(MMAP_READER_CLASSNAME);
Constructor<?> ctor = supplierClass.getConstructor(Path.class);
return (ReaderSupplier) ctor.newInstance(path);
} catch (Exception e) {
LOG.log(Level.WARNING, "MMapReaderSupplier not available, falling back to SimpleMappedReaderSupplier. More details available at level FINE.");
LOG.log(Level.FINE, "MMapReaderSupplier instantiation exception:", e);
if (Files.size(path) > Integer.MAX_VALUE) {
throw new RuntimeException("File sizes greater than 2GB are not supported on Windows--contributions welcome");
throw new RuntimeException("File sizes greater than 2GB are not supported on older Windows JDKs");
}

// finally, fall back to SimpleMappedReader (available everywhere, but doesn't support files > 2GB)
return new SimpleMappedReader.Supplier(path);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

package io.github.jbellis.jvector.example;

import io.github.jbellis.jvector.disk.ReaderSupplierFactory;
import io.github.jbellis.jvector.example.util.CompressorParameters;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.ReaderSupplierFactory;
import io.github.jbellis.jvector.graph.GraphIndex;
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
import io.github.jbellis.jvector.graph.GraphSearcher;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

package io.github.jbellis.jvector.example;

import io.github.jbellis.jvector.disk.ReaderSupplierFactory;
import io.github.jbellis.jvector.example.util.MMapRandomAccessVectorValues;
import io.github.jbellis.jvector.example.util.ReaderSupplierFactory;
import io.github.jbellis.jvector.example.util.UpdatableRandomAccessVectorValues;
import io.github.jbellis.jvector.graph.GraphIndex;
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
Expand All @@ -30,13 +30,10 @@
import io.github.jbellis.jvector.graph.similarity.ScoreFunction;
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
import io.github.jbellis.jvector.quantization.CompressedVectors;
import io.github.jbellis.jvector.quantization.PQVectors;
import io.github.jbellis.jvector.quantization.ProductQuantization;
import io.github.jbellis.jvector.util.Bits;
import io.github.jbellis.jvector.util.PhysicalCoreExecutor;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.github.jbellis.jvector.vector.VectorizationProvider;
import io.github.jbellis.jvector.vector.types.ByteSequence;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
import org.newsclub.net.unix.AFUNIXServerSocket;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

import io.github.jbellis.jvector.disk.RandomAccessReader;
import io.github.jbellis.jvector.disk.ReaderSupplier;
import io.github.jbellis.jvector.disk.ReaderSupplierFactory;
import io.github.jbellis.jvector.disk.SimpleMappedReader;
import io.github.jbellis.jvector.example.util.MMapReader;
import io.github.jbellis.jvector.example.util.SiftLoader;
import io.github.jbellis.jvector.graph.GraphIndex;
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
Expand Down Expand Up @@ -158,7 +158,7 @@ public static void siftPersisted(List<VectorFloat<?>> baseVectors, List<VectorFl

// on-disk indexes require a ReaderSupplier (not just a Reader) because we will want it to
// open additional readers for searching
ReaderSupplier rs = new SimpleMappedReader.Supplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
// measure our recall against the (exactly computed) ground truth
Function<VectorFloat<?>, SearchScoreProvider> sspFactory = q -> SearchScoreProvider.exact(q, VectorSimilarityFunction.EUCLIDEAN, ravv);
Expand Down Expand Up @@ -190,7 +190,7 @@ public static void siftDiskAnn(List<VectorFloat<?>> baseVectors, List<VectorFloa
pqv.write(out);
}

ReaderSupplier rs = new MMapReader.Supplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
// load the PQVectors that we just wrote to disk
try (RandomAccessReader in = new SimpleMappedReader(pqPath)) {
Expand Down Expand Up @@ -253,7 +253,7 @@ public static void siftDiskAnnLTM(List<VectorFloat<?>> baseVectors, List<VectorF
}

// searching the index does not change
ReaderSupplier rs = new MMapReader.Supplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
try (RandomAccessReader in = new SimpleMappedReader(pqPath)) {
var pqvSearch = PQVectors.load(in);
Expand Down Expand Up @@ -314,7 +314,7 @@ public static void siftDiskAnnLTMWithNVQ(List<VectorFloat<?>> baseVectors, List<
}

// searching the index does not change
ReaderSupplier rs = new MMapReader.Supplier(indexPath);
ReaderSupplier rs = ReaderSupplierFactory.open(indexPath);
OnDiskGraphIndex index = OnDiskGraphIndex.load(rs);
try (RandomAccessReader in = new SimpleMappedReader(pqPath)) {
var pqvSearch = PQVectors.load(in);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.FunctionDescriptor;
import java.lang.foreign.Linker;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.lang.foreign.ValueLayout.OfFloat;
Expand All @@ -31,11 +33,15 @@
import java.nio.file.StandardOpenOption;

/**
* {@link MemorySegment} based implementation of RandomAccessReader.
* MemorySegmentReader doesn't have 2GB file size limitation of {@link SimpleMappedReader}.
* {@link MemorySegment} based implementation of RandomAccessReader. This is the recommended
* RandomAccessReader implementation included with JVector.
* <p>
* MemorySegmentReader applies MADV_RANDOM to the backing storage, and doesn't have the 2GB file size limitation
* of {@link SimpleMappedReader}.
*/
public class MemorySegmentReader implements RandomAccessReader {

private static final int MADV_RANDOM = 1; // Value for Linux
private static final OfInt intLayout = ValueLayout.JAVA_INT_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
private static final OfFloat floatLayout = ValueLayout.JAVA_FLOAT_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
private static final OfLong longLayout = ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
Expand All @@ -48,6 +54,19 @@ public MemorySegmentReader(Path path) throws IOException {
arena = Arena.ofShared();
try (var ch = FileChannel.open(path, StandardOpenOption.READ)) {
memory = ch.map(MapMode.READ_ONLY, 0L, ch.size(), arena);

// Apply MADV_RANDOM advice
var linker = Linker.nativeLinker();
var madvise = linker.downcallHandle(linker.defaultLookup().find("posix_madvise").get(),
FunctionDescriptor.of(ValueLayout.JAVA_INT, ValueLayout.ADDRESS, ValueLayout.JAVA_LONG, ValueLayout.JAVA_INT));
try {
int result = (int) madvise.invokeExact(memory, memory.byteSize(), MADV_RANDOM);
if (result != 0) {
throw new IOException("posix_madvise failed with error code: " + result);
}
} catch (Throwable t) {
throw new RuntimeException(t);
}
} catch (Exception e) {
arena.close();
throw e;
Expand Down

0 comments on commit 8e98e50

Please sign in to comment.