Skip to content

Commit a3edcbe

Browse files
authored
Deletes (#117)
* rename * new node value won't change, pull it out of loop * TODO * wip * Add markNodeDeleted Bits parameter to search() is not required to be non-null Add Bits.ALL and convenience methods No cleanup yet * remove View.getSortedNodes * wip * formatting * replace validateGraph with assertGraphEquals * clean out vestigial document cruft from mock vectorvalues * format * r/m numVectors field (always equal to array length) * createRandom[]Vectors no longer leaves null entries that need to be cleaned outc * formatting * first test for deletions * wiring in the purge. almost passes tests * fix mergeNeighbors to not add duplicate nodes, and fix test to check for duplicates * - fix removeDeletedNeighbors - NN path can include self, so exclude it from candidates array * - fix removeDeletedNeighbors - NN path can include self, so exclude it from candidates array * finish implementing renumbering for writes * rename nsize0 -> maxDegree * show input vectors when assert fails * re-use buildSequentially * encapsulate OHGI better * instead of renumbering implicitly, let caller provide remapper * add save and load methods for OHGI * r/m unused CNS.insert method with confusing semantics * fix insertDiverse ignoring current neighbors * ram freed is proportional to nodes removed * merge ConcurrentNeighborArray into NeighborArray * fix node-present check * make getSequentialRenumbering public * add failing testRenumberingOnDelete * refactor to take Map instead of Function; sort writes by new ordinal instead of old * fix ci bitching about javadoc * fix typos
1 parent 73428d5 commit a3edcbe

30 files changed

+2122
-1688
lines changed

jvector-base/src/main/java/io/github/jbellis/jvector/disk/CachingGraphIndex.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import io.github.jbellis.jvector.graph.GraphIndex;
2020
import io.github.jbellis.jvector.graph.NodesIterator;
2121
import io.github.jbellis.jvector.util.Accountable;
22+
import io.github.jbellis.jvector.util.Bits;
2223

2324
import java.io.IOException;
2425
import java.io.UncheckedIOException;
@@ -106,13 +107,8 @@ public int entryNode() {
106107
}
107108

108109
@Override
109-
public int[] getSortedNodes() {
110-
return View.super.getSortedNodes();
111-
}
112-
113-
@Override
114-
public int getNeighborCount(int node) {
115-
return View.super.getNeighborCount(node);
110+
public Bits liveNodes() {
111+
return view.liveNodes();
116112
}
117113

118114
@Override

jvector-base/src/main/java/io/github/jbellis/jvector/disk/OnDiskGraphIndex.java

Lines changed: 116 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,19 @@
1818

1919
import io.github.jbellis.jvector.graph.GraphIndex;
2020
import io.github.jbellis.jvector.graph.NodesIterator;
21+
import io.github.jbellis.jvector.graph.OnHeapGraphIndex;
2122
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
2223
import io.github.jbellis.jvector.util.Accountable;
24+
import io.github.jbellis.jvector.util.Bits;
2325

2426
import java.io.DataOutput;
2527
import java.io.IOException;
2628
import java.io.UncheckedIOException;
29+
import java.util.ArrayList;
30+
import java.util.Comparator;
31+
import java.util.HashMap;
32+
import java.util.Map;
33+
import java.util.stream.IntStream;
2734

2835
public class OnDiskGraphIndex<T> implements GraphIndex<T>, AutoCloseable, Accountable
2936
{
@@ -49,6 +56,26 @@ public OnDiskGraphIndex(ReaderSupplier readerSupplier, long offset)
4956
}
5057
}
5158

59+
/**
60+
* @return a Map of old to new graph ordinals where the new ordinals are sequential starting at 0,
61+
* while preserving the original relative ordering in `graph`. That is, for all node ids i and j,
62+
* if i &lt; j in `graph` then map[i] &lt; map[j] in the returned map.
63+
*/
64+
public static <T> Map<Integer, Integer> getSequentialRenumbering(GraphIndex<T> graph) {
65+
try (var view = graph.getView()) {
66+
Map<Integer, Integer> oldToNewMap = new HashMap<>();
67+
int nextOrdinal = 0;
68+
for (int i = 0; i < view.getIdUpperBound(); i++) {
69+
if (graph.containsNode(i)) {
70+
oldToNewMap.put(i, nextOrdinal++);
71+
}
72+
}
73+
return oldToNewMap;
74+
} catch (Exception e) {
75+
throw new RuntimeException(e);
76+
}
77+
}
78+
5279
@Override
5380
public int size() {
5481
return size;
@@ -118,6 +145,11 @@ public int entryNode() {
118145
return OnDiskGraphIndex.this.entryNode;
119146
}
120147

148+
@Override
149+
public Bits liveNodes() {
150+
return Bits.ALL;
151+
}
152+
121153
@Override
122154
public void close() throws IOException {
123155
reader.close();
@@ -127,7 +159,7 @@ public void close() throws IOException {
127159
@Override
128160
public NodesIterator getNodes()
129161
{
130-
throw new UnsupportedOperationException();
162+
return NodesIterator.fromPrimitiveIterator(IntStream.range(0, size).iterator(), size);
131163
}
132164

133165
@Override
@@ -139,37 +171,92 @@ public void close() throws IOException {
139171
readerSupplier.close();
140172
}
141173

142-
// takes Graph and Vectors separately since I'm reluctant to introduce a Vectors reference
143-
// to OnHeapGraphIndex just for this method. Maybe that will end up the best solution,
144-
// but I'm not sure yet.
145-
public static <T> void write(GraphIndex<T> graph, RandomAccessVectorValues<T> vectors, DataOutput out) throws IOException {
146-
assert graph.size() == vectors.size() : String.format("graph size %d != vectors size %d", graph.size(), vectors.size());
147-
148-
var view = graph.getView();
149-
150-
// graph-level properties
151-
out.writeInt(graph.size());
152-
out.writeInt(vectors.dimension());
153-
out.writeInt(view.entryNode());
154-
out.writeInt(graph.maxDegree());
155-
156-
// for each graph node, write the associated vector and its neighbors
157-
for (int node = 0; node < graph.size(); node++) {
158-
out.writeInt(node); // unnecessary, but a reasonable sanity check
159-
Io.writeFloats(out, (float[]) vectors.vectorValue(node));
160-
161-
var neighbors = view.getNeighborsIterator(node);
162-
out.writeInt(neighbors.size());
163-
int n = 0;
164-
for ( ; n < neighbors.size(); n++) {
165-
out.writeInt(neighbors.nextInt());
174+
/**
175+
* @param graph the graph to write
176+
* @param vectors the vectors associated with each node
177+
* @param out the output to write to
178+
*
179+
* If any nodes have been deleted, you must use the overload specifying `oldToNewOrdinals` instead.
180+
*/
181+
public static <T> void write(GraphIndex<T> graph, RandomAccessVectorValues<T> vectors, DataOutput out)
182+
throws IOException
183+
{
184+
try (var view = graph.getView()) {
185+
if (view.getIdUpperBound() > graph.size()) {
186+
throw new IllegalArgumentException("Graph contains deletes, must specify oldToNewOrdinals map");
187+
}
188+
} catch (Exception e) {
189+
throw new IOException(e);
190+
}
191+
write(graph, vectors, getSequentialRenumbering(graph), out);
192+
}
193+
194+
/**
195+
* @param graph the graph to write
196+
* @param vectors the vectors associated with each node
197+
* @param oldToNewOrdinals A map from old to new ordinals. If ordinal numbering does not matter,
198+
* you can use `getSequentialRenumbering`, which will "fill in" holes left by
199+
* any deleted nodes.
200+
* @param out the output to write to
201+
*/
202+
public static <T> void write(GraphIndex<T> graph,
203+
RandomAccessVectorValues<T> vectors,
204+
Map<Integer, Integer> oldToNewOrdinals,
205+
DataOutput out)
206+
throws IOException
207+
{
208+
if (graph instanceof OnHeapGraphIndex) {
209+
var ohgi = (OnHeapGraphIndex<T>) graph;
210+
if (ohgi.getDeletedNodes().cardinality() > 0) {
211+
throw new IllegalArgumentException("Run builder.cleanup() before writing the graph");
166212
}
167-
assert !neighbors.hasNext();
213+
}
214+
if (oldToNewOrdinals.size() != graph.size()) {
215+
throw new IllegalArgumentException(String.format("ordinalMapper size %d does not match graph size %d",
216+
oldToNewOrdinals.size(), graph.size()));
217+
}
218+
219+
var entriesByNewOrdinal = new ArrayList<>(oldToNewOrdinals.entrySet());
220+
entriesByNewOrdinal.sort(Comparator.comparingInt(Map.Entry::getValue));
221+
// the last new ordinal should be size-1
222+
if (graph.size() > 0 && entriesByNewOrdinal.get(entriesByNewOrdinal.size() - 1).getValue() != graph.size() - 1) {
223+
throw new IllegalArgumentException("oldToNewOrdinals produced out-of-range entries");
224+
}
225+
226+
try (var view = graph.getView()) {
227+
// graph-level properties
228+
out.writeInt(graph.size());
229+
out.writeInt(vectors.dimension());
230+
out.writeInt(view.entryNode());
231+
out.writeInt(graph.maxDegree());
232+
233+
// for each graph node, write the associated vector and its neighbors
234+
for (int i = 0; i < oldToNewOrdinals.size(); i++) {
235+
var entry = entriesByNewOrdinal.get(i);
236+
int originalOrdinal = entry.getKey();
237+
int newOrdinal = entry.getValue();
238+
if (!graph.containsNode(originalOrdinal)) {
239+
continue;
240+
}
168241

169-
// pad out to maxEdgesPerNode
170-
for (; n < graph.maxDegree(); n++) {
171-
out.writeInt(-1);
242+
out.writeInt(newOrdinal); // unnecessary, but a reasonable sanity check
243+
Io.writeFloats(out, (float[]) vectors.vectorValue(originalOrdinal));
244+
245+
var neighbors = view.getNeighborsIterator(originalOrdinal);
246+
out.writeInt(neighbors.size());
247+
int n = 0;
248+
for (; n < neighbors.size(); n++) {
249+
out.writeInt(oldToNewOrdinals.get(neighbors.nextInt()));
250+
}
251+
assert !neighbors.hasNext();
252+
253+
// pad out to maxEdgesPerNode
254+
for (; n < graph.maxDegree(); n++) {
255+
out.writeInt(-1);
256+
}
172257
}
258+
} catch (Exception e) {
259+
throw new IOException(e);
173260
}
174261
}
175262
}

0 commit comments

Comments
 (0)