Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions src/store/src/Bridge/ChromaDb/Store.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,23 +53,40 @@ public function add(VectorDocument ...$documents): void
}

/**
* @param array{where?: array<string, string>, whereDocument?: array<string, mixed>} $options
* @param array{where?: array<string, string>, whereDocument?: array<string, mixed>, include?: array<string>} $options
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What expectations do we have here for the `include` option?

If I just want embeddings, metadatas and distances, how would I enable that?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't want to specify anything, simply leave it out. Then the default from the ChromaDB client library will be used, which is how it was before. However, as soon as something is passed by the user, we should ensure that at least the metadatas and embeddings are included, which the `array_merge` does (maybe we can remove `distances` from the enforced values).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't want to specify anything, simply leave it out.

yup, got that, that's good 👍

I understand that we basically treat ['embeddings', 'metadatas', 'distances'] as the default here, right?
What would I need to do to enable that default? I'm questioning whether that even works with the current implementation. Adding a test here would be great.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for reviewing the code and for the comments.

I've added some tests.
The default $include = null should work since CodeWithKyrian/chromadb-php has the correct default (see link).

https://github.com/CodeWithKyrian/chromadb-php/blob/3bf9f8db6ebfbd805774fa81176f5029afab0371/src/Resources/CollectionResource.php#L279

However, we can also define our own default if you like...

*/
public function query(Vector $vector, array $options = []): iterable
{
$include = null;
if ([] !== ($options['include'] ?? [])) {
$include = array_values(
array_unique(
array_merge(['embeddings', 'metadatas', 'distances'], $options['include'])
)
);
}

$collection = $this->client->getOrCreateCollection($this->collectionName);
$queryResponse = $collection->query(
queryEmbeddings: [$vector->getData()],
nResults: 4,
where: $options['where'] ?? null,
whereDocument: $options['whereDocument'] ?? null,
include: $include,
);

for ($i = 0; $i < \count($queryResponse->metadatas[0]); ++$i) {
$metaCount = \count($queryResponse->metadatas[0]);

for ($i = 0; $i < $metaCount; ++$i) {
$metaData = new Metadata($queryResponse->metadatas[0][$i]);
if (isset($queryResponse->documents[0][$i])) {
$metaData->setText($queryResponse->documents[0][$i]);
}

yield new VectorDocument(
id: Uuid::fromString($queryResponse->ids[0][$i]),
vector: new Vector($queryResponse->embeddings[0][$i]),
metadata: new Metadata($queryResponse->metadatas[0][$i]),
metadata: $metaData,
score: $queryResponse->distances[0][$i] ?? null,
);
}
Expand Down
102 changes: 102 additions & 0 deletions src/store/tests/Bridge/ChromaDb/StoreTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,108 @@ public function testQueryWithVariousFilterCombinations(
$this->assertCount(1, $documents);
}

/**
 * Queries the store without any "include" option and checks that the id,
 * vector and metadata of the canned response end up on the yielded document.
 *
 * The mocked collection puts no constraint on the arguments passed to
 * query(); only the mapping of the response is verified here.
 */
public function testQueryReturnsMetadatasEmbeddingsDistanceWithoutInclude()
{
    // Test doubles: the client hands out exactly one collection for the configured name.
    $collectionMock = $this->createMock(CollectionResource::class);
    $clientMock = $this->createMock(Client::class);

    $clientMock->expects($this->once())
        ->method('getOrCreateCollection')
        ->with('test-collection')
        ->willReturn($collectionMock);

    // Canned response: one hit carrying an embedding and metadata, but no document text or distance.
    $cannedResponse = new QueryItemsResponse(
        ids: [['01234567-89ab-cdef-0123-456789abcdef']],
        embeddings: [[[0.1, 0.2, 0.3]]],
        metadatas: [[['title' => 'Doc 1']]],
        documents: null,
        data: null,
        uris: null,
        distances: null
    );

    $collectionMock->expects($this->once())
        ->method('query')
        ->willReturn($cannedResponse);

    $store = new Store($clientMock, 'test-collection');
    $searchVector = new Vector([0.15, 0.25, 0.35]);

    $results = iterator_to_array($store->query($searchVector));

    $this->assertCount(1, $results);

    $firstHit = $results[0];
    $this->assertSame('01234567-89ab-cdef-0123-456789abcdef', (string) $firstHit->id);
    $this->assertSame([0.1, 0.2, 0.3], $firstHit->vector->getData());
    $this->assertSame(['title' => 'Doc 1'], $firstHit->metadata->getArrayCopy());
}

/**
 * Queries the store with "include" limited to "documents" and checks that the
 * yielded document still carries id and vector, and that the returned
 * document text is exposed under the "_text" metadata key.
 *
 * The mocked collection puts no constraint on the arguments passed to
 * query(); only the mapping of the response is verified here.
 */
public function testQueryReturnsMetadatasEmbeddingsDistanceWithOnlyDocuments()
{
    // Test doubles: the client hands out exactly one collection for the configured name.
    $collectionMock = $this->createMock(CollectionResource::class);
    $clientMock = $this->createMock(Client::class);

    $clientMock->expects($this->once())
        ->method('getOrCreateCollection')
        ->with('test-collection')
        ->willReturn($collectionMock);

    // Canned response: one hit with embedding, metadata and document text, but no distance.
    $cannedResponse = new QueryItemsResponse(
        ids: [['01234567-89ab-cdef-0123-456789abcdef']],
        embeddings: [[[0.1, 0.2, 0.3]]],
        metadatas: [[['title' => 'Doc 1']]],
        documents: [['Document content here']],
        data: null,
        uris: null,
        distances: null
    );

    $collectionMock->expects($this->once())
        ->method('query')
        ->willReturn($cannedResponse);

    $store = new Store($clientMock, 'test-collection');
    $searchVector = new Vector([0.15, 0.25, 0.35]);
    $queryOptions = ['include' => ['documents']];

    $results = iterator_to_array($store->query($searchVector, $queryOptions));

    $this->assertCount(1, $results);

    $firstHit = $results[0];
    $this->assertSame('01234567-89ab-cdef-0123-456789abcdef', (string) $firstHit->id);
    $this->assertSame([0.1, 0.2, 0.3], $firstHit->vector->getData());
    $this->assertSame(
        ['title' => 'Doc 1', '_text' => 'Document content here'],
        $firstHit->metadata->getArrayCopy()
    );
}

/**
 * Queries the store with "include" listing embeddings, metadatas, distances
 * and documents, and checks that id, vector and metadata (including the
 * document text under the "_text" key) are mapped onto the yielded document.
 *
 * The mocked collection puts no constraint on the arguments passed to
 * query(); only the mapping of the response is verified here.
 */
public function testQueryReturnsMetadatasEmbeddingsDistanceWithAll()
{
    // Test doubles: the client hands out exactly one collection for the configured name.
    $collectionMock = $this->createMock(CollectionResource::class);
    $clientMock = $this->createMock(Client::class);

    $clientMock->expects($this->once())
        ->method('getOrCreateCollection')
        ->with('test-collection')
        ->willReturn($collectionMock);

    // Canned response: one hit with embedding, metadata and document text, but no distance.
    $cannedResponse = new QueryItemsResponse(
        ids: [['01234567-89ab-cdef-0123-456789abcdef']],
        embeddings: [[[0.1, 0.2, 0.3]]],
        metadatas: [[['title' => 'Doc 1']]],
        documents: [['Document content here']],
        data: null,
        uris: null,
        distances: null
    );

    $collectionMock->expects($this->once())
        ->method('query')
        ->willReturn($cannedResponse);

    $store = new Store($clientMock, 'test-collection');
    $searchVector = new Vector([0.15, 0.25, 0.35]);
    $queryOptions = ['include' => ['embeddings', 'metadatas', 'distances', 'documents']];

    $results = iterator_to_array($store->query($searchVector, $queryOptions));

    $this->assertCount(1, $results);

    $firstHit = $results[0];
    $this->assertSame('01234567-89ab-cdef-0123-456789abcdef', (string) $firstHit->id);
    $this->assertSame([0.1, 0.2, 0.3], $firstHit->vector->getData());
    $this->assertSame(
        ['title' => 'Doc 1', '_text' => 'Document content here'],
        $firstHit->metadata->getArrayCopy()
    );
}

/**
* @return \Iterator<string, array{
* options: array{where?: array<string, string>, whereDocument?: array<string, mixed>},
Expand Down
Loading