diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index b8ba5fae..2a4f6695 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -1,6 +1,9 @@ name: pipeline on: pull_request +env: + REQUIRED_PHP_EXTENSIONS: 'mongodb' + jobs: tests: runs-on: ubuntu-latest @@ -16,6 +19,8 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: ${{ matrix.php }} + coverage: "none" + extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}" - name: Install Composer uses: "ramsey/composer-install@v3" @@ -41,6 +46,8 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '8.2' + coverage: "none" + extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}" - name: Install Composer uses: "ramsey/composer-install@v3" diff --git a/README.md b/README.md index bf1aef63..8ad25701 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Supported Stores * [x] [ChromaDB](https://trychroma.com) * [x] [Azure AI Search](https://azure.microsoft.com/en-us/products/ai-services/ai-search) +* [x] [MongoDB Atlas Search](https://mongodb.com/products/platform/atlas-vector-search) * [ ] [Pinecone](https://pinecone.io) Provided Tools diff --git a/composer.json b/composer.json index 545a2fa8..fdb83328 100644 --- a/composer.json +++ b/composer.json @@ -24,6 +24,7 @@ }, "require-dev": { "codewithkyrian/chromadb-php": "^0.2.1", + "mongodb/mongodb": "^1.19", "php-cs-fixer/shim": "^3.64", "phpstan/phpstan": "^1.12", "phpunit/phpunit": "^11.3", @@ -36,6 +37,7 @@ "symfony/var-dumper": "^6.4 || ^7.1" }, "suggest": { + "mongodb/mongodb": "For using MongoDB Atlas as retrieval vector store.", "codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.", "symfony/clock": "For using the clock tool.", "symfony/css-selector": "For using the YouTube transcription tool.", diff --git a/src/Store/MongoDB/Store.php b/src/Store/MongoDB/Store.php new file mode 100644 index 00000000..16836768 --- /dev/null +++ b/src/Store/MongoDB/Store.php @@ -0,0 +1,152 @@ + + */ 
final readonly class Store implements VectorStoreInterface
{
    /**
     * @param string $databaseName    The name of the database
     * @param string $collectionName  The name of the collection
     * @param string $indexName       The name of the Atlas Search index
     * @param string $vectorFieldName The name of the field in the index that contains the vector
     * @param bool   $bulkWrite       Use bulk write operations
     */
    public function __construct(
        private Client $client,
        private LoggerInterface $logger,
        private string $databaseName,
        private string $collectionName,
        private string $indexName,
        private string $vectorFieldName = 'vector',
        private bool $bulkWrite = false,
    ) {
    }

    /**
     * Stores a single document; delegates to addDocuments().
     */
    public function addDocument(Document $document): void
    {
        $this->addDocuments([$document]);
    }

    /**
     * Upserts the given documents into the configured collection, keyed by their id.
     *
     * With $bulkWrite enabled all replacements are collected and sent in one
     * bulkWrite() call; otherwise one replaceOne() call is issued per document.
     *
     * A document without a vector triggers a warning and is stored without the
     * vector field instead of aborting the whole call.
     *
     * @param Document[] $documents
     */
    public function addDocuments(array $documents): void
    {
        $operations = [];

        foreach ($documents as $document) {
            if (!$document->hasVector()) {
                $this->logger->warning('Document {id} does not have a vector', ['id' => $document->id]);
            }

            $operation = [
                ['_id' => $this->toBinary($document->id)], // we use binary for the id, because of storage efficiency
                // array_filter() drops empty entries, so a missing vector or text is simply not written
                array_filter([
                    'metadata' => $document->metadata,
                    // nullsafe: the warning above must not be followed by a fatal error on a null vector
                    $this->vectorFieldName => $document->vector?->getData(),
                    'text' => $document->text,
                ]),
                ['upsert' => true], // insert if not exists
            ];

            if ($this->bulkWrite) {
                $operations[] = ['replaceOne' => $operation];
                continue;
            }

            $this->getCollection()->replaceOne(...$operation);
        }

        // bulkWrite() rejects an empty operation list, so skip it when there is nothing to write
        if ($this->bulkWrite && [] !== $operations) {
            $this->getCollection()->bulkWrite($operations);
        }
    }

    /**
     * Runs a $vectorSearch aggregation against the configured index and maps the hits to documents.
     *
     * Caller-supplied options are merged over the defaults (numCandidates: 200, limit: 5)
     * and may also override any other key of the $vectorSearch stage.
     *
     * The search score is added to every result row as "score", but it is not
     * carried over into the returned Document instances.
     *
     * @param array{
     *     limit?: positive-int,
     *     numCandidates?: positive-int,
     *     filter?: array
     * } $options
     *
     * @return Document[]
     */
    public function query(Vector $vector, array $options = []): array
    {
        $vectorSearch = array_merge([
            'index' => $this->indexName,
            'path' => $this->vectorFieldName,
            'queryVector' => $vector->getData(),
            'numCandidates' => 200,
            'limit' => 5,
        ], $options);

        $pipeline = [
            ['$vectorSearch' => $vectorSearch],
            [
                '$addFields' => [
                    'score' => ['$meta' => 'vectorSearchScore'],
                ],
            ],
        ];

        // the type map makes the driver return plain PHP arrays for rows and nested values
        $results = $this->getCollection()->aggregate(
            $pipeline,
            ['typeMap' => ['root' => 'array', 'document' => 'array', 'array' => 'array']],
        );

        $documents = [];

        foreach ($results as $result) {
            // NOTE(review): Vector::create1536() presumably pins the dimension to 1536 - confirm
            // before using this store with embeddings of another size.
            $documents[] = Document::fromVector(
                Vector::create1536($result[$this->vectorFieldName]),
                $this->toUuid($result['_id']),
                new Metadata($result['metadata'] ?? []),
            );
        }

        return $documents;
    }

    /**
     * Selects the configured collection on the client.
     */
    private function getCollection(): Collection
    {
        return $this->client->selectCollection($this->databaseName, $this->collectionName);
    }

    /**
     * Wraps the binary form of the UUID in a BSON Binary of type UUID.
     */
    private function toBinary(Uuid $uuid): Binary
    {
        return new Binary($uuid->toBinary(), Binary::TYPE_UUID);
    }

    /**
     * Converts a stored BSON Binary id back into a UUID.
     *
     * NOTE(review): relies on Uuid::fromString() accepting the raw binary payload - confirm.
     */
    private function toUuid(Binary $binary): Uuid
    {
        return Uuid::fromString($binary->getData());
    }
}