Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

Add MongoDB vector store #42

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: pipeline
on: pull_request

env:
REQUIRED_PHP_EXTENSIONS: 'mongodb'

jobs:
tests:
runs-on: ubuntu-latest
Expand All @@ -16,6 +19,8 @@ jobs:
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php }}
coverage: "none"
extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}"

- name: Install Composer
uses: "ramsey/composer-install@v3"
Expand All @@ -41,6 +46,8 @@ jobs:
uses: shivammathur/setup-php@v2
with:
php-version: '8.2'
coverage: "none"
extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}"

- name: Install Composer
uses: "ramsey/composer-install@v3"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Supported Stores

* [x] [ChromaDB](https://trychroma.com)
* [x] [Azure AI Search](https://azure.microsoft.com/en-us/products/ai-services/ai-search)
* [x] [MongoDB Atlas Search](https://mongodb.com/products/platform/atlas-vector-search)
* [ ] [Pinecone](https://pinecone.io)

Provided Tools
Expand Down
2 changes: 2 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
},
"require-dev": {
"codewithkyrian/chromadb-php": "^0.2.1",
"mongodb/mongodb": "^1.19",
"php-cs-fixer/shim": "^3.64",
"phpstan/phpstan": "^1.12",
"phpunit/phpunit": "^11.3",
Expand All @@ -36,6 +37,7 @@
"symfony/var-dumper": "^6.4 || ^7.1"
},
"suggest": {
"mongodb/mongodb": "For using MongoDB Atlas as retrieval vector store.",
"codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.",
"symfony/clock": "For using the clock tool.",
"symfony/css-selector": "For using the YouTube transcription tool.",
Expand Down
152 changes: 152 additions & 0 deletions src/Store/MongoDB/Store.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\MongoDB;

use MongoDB\BSON\Binary;
use MongoDB\Client;
use MongoDB\Collection;
use PhpLlm\LlmChain\Document\Document;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\Uid\Uuid;

/**
* @see https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/
*
* For this store you need to create a separate MongoDB Atlas Search index.
* The index needs to be created with the following settings:
* {
* "fields": [
* {
* "numDimensions": 1536,
* "path": "vector",
* "similarity": "euclidean",
* "type": "vector"
* }
* ]
* }
*
* Note that the `path` key needs to match the $vectorFieldName.
*
* For the `similarity` key you can choose between `euclidean`, `cosine` and `dotProduct`.
* {@see https://www.mongodb.com/docs/atlas/atlas-search/field-types/knn-vector/#define-the-index-for-the-fts-field-type-type}
*
* @author Oskar Stark <[email protected]>
*/
final readonly class Store implements VectorStoreInterface
{
    /**
     * @param Client          $client          MongoDB client used to select the target collection
     * @param LoggerInterface $logger          Receives a warning for every document added without a vector
     * @param string          $databaseName    The name of the database
     * @param string          $collectionName  The name of the collection
     * @param string          $indexName       The name of the Atlas Search index
     * @param string          $vectorFieldName The name of the field in the index that contains the vector
     * @param bool            $bulkWrite       Use bulk write operations
     */
    public function __construct(
        private Client $client,
        private LoggerInterface $logger,
        private string $databaseName,
        private string $collectionName,
        private string $indexName,
        private string $vectorFieldName = 'vector',
        private bool $bulkWrite = false,
    ) {
    }

    /**
     * Stores a single document; shorthand for addDocuments() with a one-element list.
     */
    public function addDocument(Document $document): void
    {
        $this->addDocuments([$document]);
    }

    /**
     * Upserts the given documents into the collection, keyed by their id.
     *
     * With $bulkWrite enabled all upserts are sent in one bulkWrite() call,
     * otherwise each document is written with its own replaceOne() call.
     *
     * A document without a vector triggers a warning and is stored without the
     * vector field (null entries are removed by array_filter below).
     *
     * @param Document[] $documents
     */
    public function addDocuments(array $documents): void
    {
        // Select the collection once instead of once per document.
        $collection = $this->getCollection();
        $operations = [];

        foreach ($documents as $document) {
            if (!$document->hasVector()) {
                $this->logger->warning('Document {id} does not have a vector', ['id' => $document->id]);
            }

            $operation = [
                ['_id' => $this->toBinary($document->id)], // we use binary for the id, because of storage efficiency
                array_filter([
                    'metadata' => $document->metadata,
                    // Nullsafe: a missing vector must not abort the whole batch with a
                    // "member function on null" error right after the warning above.
                    $this->vectorFieldName => $document->vector?->getData(),
                    'text' => $document->text,
                ]),
                ['upsert' => true], // insert if not exists
            ];

            if ($this->bulkWrite) {
                $operations[] = ['replaceOne' => $operation];
                continue;
            }

            $collection->replaceOne(...$operation);
        }

        // bulkWrite() rejects an empty operation list, so skip it when nothing was queued.
        if ($this->bulkWrite && [] !== $operations) {
            $collection->bulkWrite($operations);
        }
    }

    /**
     * Runs a `$vectorSearch` aggregation for the given vector and maps the hits to documents.
     *
     * Defaults are `numCandidates` = 200 and `limit` = 5; entries in $options are
     * merged over the defaults, so any `$vectorSearch` key passed in $options wins.
     *
     * @param array{
     *     limit?: positive-int,
     *     numCandidates?: positive-int,
     *     filter?: array<mixed>
     * } $options
     *
     * @return Document[]
     */
    public function query(Vector $vector, array $options = []): array
    {
        $results = $this->getCollection()->aggregate([
            [
                '$vectorSearch' => array_merge([
                    'index' => $this->indexName,
                    'path' => $this->vectorFieldName,
                    'queryVector' => $vector->getData(),
                    'numCandidates' => 200,
                    'limit' => 5,
                ], $options),
            ],
            [
                // Exposes the search score on each result row; it is not mapped onto the Document below.
                '$addFields' => [
                    'score' => ['$meta' => 'vectorSearchScore'],
                ],
            ],
        ], ['typeMap' => ['root' => 'array', 'document' => 'array', 'array' => 'array']]);

        $documents = [];

        foreach ($results as $result) {
            // NOTE(review): create1536() presumably fixes the vector size at 1536 dimensions — confirm
            // this matches the numDimensions of the configured index.
            $documents[] = Document::fromVector(
                Vector::create1536($result[$this->vectorFieldName]),
                $this->toUuid($result['_id']),
                new Metadata($result['metadata'] ?? []),
            );
        }

        return $documents;
    }

    /**
     * Selects the configured collection from the configured database.
     */
    private function getCollection(): Collection
    {
        return $this->client->selectCollection($this->databaseName, $this->collectionName);
    }

    /**
     * Wraps the binary form of the UUID in a BSON Binary of subtype UUID.
     */
    private function toBinary(Uuid $uuid): Binary
    {
        return new Binary($uuid->toBinary(), Binary::TYPE_UUID);
    }

    /**
     * Converts a stored BSON Binary id back into a Uuid.
     *
     * NOTE(review): relies on Uuid::fromString() accepting the raw bytes returned by getData() — confirm.
     */
    private function toUuid(Binary $binary): Uuid
    {
        return Uuid::fromString($binary->getData());
    }
}