From bd265703f893d5b7c4c05ea563fbd60cde2698ea Mon Sep 17 00:00:00 2001
From: valtzu
Date: Tue, 24 Jun 2025 21:14:33 +0300
Subject: [PATCH] feat: add MariaDB store

---
 .env                                         |   3 +
 compose.yaml                                 |   8 +
 composer.json                                |   4 +
 examples/store/mariadb-similarity-search.php |  74 +++++++++
 src/Store/Bridge/MariaDB/Store.php           | 191 +++++++++++++++++++
 5 files changed, 280 insertions(+)
 create mode 100644 compose.yaml
 create mode 100644 examples/store/mariadb-similarity-search.php
 create mode 100644 src/Store/Bridge/MariaDB/Store.php

diff --git a/.env b/.env
index 79f19d23..7dbea184 100644
--- a/.env
+++ b/.env
@@ -64,3 +64,6 @@ RUN_EXPENSIVE_EXAMPLES=false
 
 # For using Gemini
 GOOGLE_API_KEY=
+
+# For MariaDB store. Server defined in compose.yaml
+MARIADB_URI=pdo-mysql://root@127.0.0.1:3309/my_database
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 00000000..cc3dab63
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,8 @@
+services:
+  mariadb:
+    image: mariadb:11.7
+    environment:
+      MARIADB_ALLOW_EMPTY_ROOT_PASSWORD: 1
+      MARIADB_DATABASE: my_database
+    ports:
+      - "3309:3306"
diff --git a/composer.json b/composer.json
index fa1a0d3e..30ed7360 100644
--- a/composer.json
+++ b/composer.json
@@ -37,9 +37,11 @@
         "webmozart/assert": "^1.11"
     },
     "require-dev": {
+        "ext-pdo": "*",
         "codewithkyrian/chromadb-php": "^0.2.1 || ^0.3 || ^0.4",
         "codewithkyrian/transformers": "^0.5.3",
         "async-aws/bedrock-runtime": "^0.1.0",
+        "doctrine/dbal": "^3.0 || ^4.0",
         "mongodb/mongodb": "^1.21 || ^2.0",
         "php-cs-fixer/shim": "^3.70",
         "phpstan/phpstan": "^2.0",
@@ -58,9 +60,11 @@
         "symfony/var-dumper": "^6.4 || ^7.1"
     },
     "suggest": {
+        "ext-pdo": "For using MariaDB as retrieval vector store.",
         "async-aws/bedrock-runtime": "For using the Bedrock platform.",
         "codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.",
         "codewithkyrian/transformers": "For using the TransformersPHP with FFI to run models in PHP.",
+        "doctrine/dbal": "For using MariaDB via Doctrine as retrieval vector store.",
         "mongodb/mongodb": "For using MongoDB Atlas as retrieval vector store.",
         "probots-io/pinecone-php": "For using the Pinecone as retrieval vector store.",
         "symfony/css-selector": "For using the YouTube transcription tool.",
diff --git a/examples/store/mariadb-similarity-search.php b/examples/store/mariadb-similarity-search.php
new file mode 100644
index 00000000..65332d59
--- /dev/null
+++ b/examples/store/mariadb-similarity-search.php
@@ -0,0 +1,74 @@
+loadEnv(dirname(__DIR__, 2).'/.env');
+
+if (empty($_ENV['OPENAI_API_KEY']) || empty($_ENV['MARIADB_URI'])) {
+    echo 'Please set OPENAI_API_KEY and MARIADB_URI environment variables.'.\PHP_EOL;
+    exit(1);
+}
+
+// initialize the store
+$store = Store::fromDbal(
+    connection: DriverManager::getConnection((new DsnParser())->parse($_ENV['MARIADB_URI'])),
+    tableName: 'my_table',
+    indexName: 'my_index',
+    vectorFieldName: 'embedding',
+);
+
+// our data
+$movies = [
+    ['title' => 'Inception', 'description' => 'A skilled thief is given a chance at redemption if he can successfully perform inception, the act of planting an idea in someone\'s subconscious.', 'director' => 'Christopher Nolan'],
+    ['title' => 'The Matrix', 'description' => 'A hacker discovers the world he lives in is a simulated reality and joins a rebellion to overthrow its controllers.', 'director' => 'The Wachowskis'],
+    ['title' => 'The Godfather', 'description' => 'The aging patriarch of an organized crime dynasty transfers control of his empire to his reluctant son.', 'director' => 'Francis Ford Coppola'],
+];
+
+// create embeddings and documents
+foreach ($movies as $i => $movie) {
+    $documents[] = new TextDocument(
+        id: Uuid::v4(),
+        content: 'Title: '.$movie['title'].\PHP_EOL.'Director: '.$movie['director'].\PHP_EOL.'Description: '.$movie['description'],
+        metadata: new Metadata($movie),
+    );
+}
+
+// initialize the table
+$store->initialize();
+
+// create embeddings for documents
+$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
+$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
+$indexer->index($documents);
+
+$model = new GPT(GPT::GPT_4O_MINI);
+
+$similaritySearch = new SimilaritySearch($platform, $embeddings, $store);
+$toolbox = Toolbox::create($similaritySearch);
+$processor = new ChainProcessor($toolbox);
+$chain = new Chain($platform, $model, [$processor], [$processor]);
+
+$messages = new MessageBag(
+    Message::forSystem('Please answer all user questions only using SimilaritySearch function.'),
+    Message::ofUser('Which movie fits the theme of the mafia?')
+);
+$response = $chain->call($messages);
+
+echo $response->getContent().\PHP_EOL;
diff --git a/src/Store/Bridge/MariaDB/Store.php b/src/Store/Bridge/MariaDB/Store.php
new file mode 100644
index 00000000..dc85a1f9
--- /dev/null
+++ b/src/Store/Bridge/MariaDB/Store.php
@@ -0,0 +1,191 @@
+=11.7.
+ *
+ * @see https://mariadb.org/rag-with-mariadb-vector/
+ *
+ * @author Valtteri R
+ */
+final readonly class Store implements VectorStoreInterface, InitializableStoreInterface
+{
+    /**
+     * @param string $tableName       The name of the table
+     * @param string $indexName       The name of the vector search index
+     * @param string $vectorFieldName The name of the field in the index that contains the vector
+     */
+    public function __construct(
+        private \PDO $connection,
+        private string $tableName,
+        private string $indexName,
+        private string $vectorFieldName,
+    ) {
+    }
+
+    /**
+     * Creates a store on top of an existing PDO connection.
+     */
+    public static function fromPdo(\PDO $connection, string $tableName, string $indexName = 'embedding', string $vectorFieldName = 'embedding'): self
+    {
+        return new self($connection, $tableName, $indexName, $vectorFieldName);
+    }
+
+    /**
+     * Creates a store from a Doctrine DBAL connection by reusing its native PDO connection.
+     *
+     * @throws DBALException
+     * @throws InvalidArgumentException When the connection does not use a PDO driver
+     */
+    public static function fromDbal(Connection $connection, string $tableName, string $indexName = 'embedding', string $vectorFieldName = 'embedding'): self
+    {
+        $pdo = $connection->getNativeConnection();
+
+        if (!$pdo instanceof \PDO) {
+            throw new InvalidArgumentException('Only DBAL connections using PDO driver are supported.');
+        }
+
+        return self::fromPdo($pdo, $tableName, $indexName, $vectorFieldName);
+    }
+
+    /**
+     * Inserts the given documents; rows whose id already exists get their metadata and vector replaced.
+     */
+    public function add(VectorDocument ...$documents): void
+    {
+        // VALUES(col) refers to the value of the attempted insert, so every named placeholder occurs only
+        // once and the statement also works with native (non-emulated) prepared statements.
+        $statement = $this->connection->prepare(
+            \sprintf(
+                <<<'SQL'
+                    INSERT INTO %1$s (id, metadata, %2$s)
+                    VALUES (:id, :metadata, VEC_FromText(:vector))
+                    ON DUPLICATE KEY UPDATE metadata = VALUES(metadata), %2$s = VALUES(%2$s)
+                    SQL,
+                $this->tableName,
+                $this->vectorFieldName,
+            ),
+        );
+
+        foreach ($documents as $document) {
+            $operation = [
+                'id' => $document->id->toBinary(),
+                'metadata' => json_encode($document->metadata->getArrayCopy(), \JSON_THROW_ON_ERROR),
+                'vector' => json_encode($document->vector->getData(), \JSON_THROW_ON_ERROR),
+            ];
+
+            $statement->execute($operation);
+        }
+    }
+
+    /**
+     * Returns the documents nearest to the given vector, ordered by ascending Euclidean distance.
+     *
+     * The score of each document is that distance, so lower means more similar. When $minScore is
+     * given, only documents with a distance of at least $minScore are returned.
+     *
+     * @param array{
+     *     limit?: positive-int,
+     * } $options Maximum number of results via "limit" (defaults to 5)
+     *
+     * @return VectorDocument[]
+     */
+    public function query(Vector $vector, array $options = [], ?float $minScore = null): array
+    {
+        $params = ['embedding' => json_encode($vector->getData(), \JSON_THROW_ON_ERROR)];
+        $filter = '';
+
+        if (null !== $minScore) {
+            // Format the filter on its own: sprintf() does not expand specifiers found inside its arguments,
+            // so the column name must be substituted here. The vector gets a dedicated placeholder because
+            // a named parameter may be used only once with native (non-emulated) prepared statements.
+            $filter = \sprintf(
+                'WHERE VEC_DISTANCE_EUCLIDEAN(%s, VEC_FromText(:filterEmbedding)) >= :minScore',
+                $this->vectorFieldName,
+            );
+            $params['filterEmbedding'] = $params['embedding'];
+            $params['minScore'] = $minScore;
+        }
+
+        $statement = $this->connection->prepare(
+            \sprintf(
+                <<<'SQL'
+                    SELECT id, VEC_ToText(%1$s) embedding, metadata, VEC_DISTANCE_EUCLIDEAN(%1$s, VEC_FromText(:embedding)) AS score
+                    FROM %2$s
+                    %3$s
+                    ORDER BY score ASC
+                    LIMIT %4$d
+                    SQL,
+                $this->vectorFieldName,
+                $this->tableName,
+                $filter,
+                $options['limit'] ?? 5,
+            ),
+        );
+        $statement->execute($params);
+
+        $documents = [];
+
+        foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $result) {
+            $documents[] = new VectorDocument(
+                id: Uuid::fromBinary($result['id']),
+                vector: new Vector(json_decode((string) $result['embedding'], true, 512, \JSON_THROW_ON_ERROR)),
+                metadata: new Metadata(json_decode($result['metadata'] ?? '{}', true, 512, \JSON_THROW_ON_ERROR)),
+                score: (float) $result['score'],
+            );
+        }
+
+        return $documents;
+    }
+
+    /**
+     * Creates the table with its vector index unless it already exists.
+     *
+     * The vector column is declared with a fixed size of 1536 dimensions.
+     *
+     * @param array{} $options No options are supported; passing any throws
+     *
+     * @throws InvalidArgumentException When options are given or the server is not MariaDB >=11.7
+     */
+    public function initialize(array $options = []): void
+    {
+        if ([] !== $options) {
+            throw new InvalidArgumentException('No supported options');
+        }
+
+        $serverVersion = (string) $this->connection->getAttribute(\PDO::ATTR_SERVER_VERSION);
+
+        if (!str_contains($serverVersion, 'MariaDB') || version_compare($serverVersion, '11.7.0') < 0) {
+            throw new InvalidArgumentException('You need MariaDB >=11.7 to use this feature');
+        }
+
+        $this->connection->exec(
+            \sprintf(
+                <<<'SQL'
+                    CREATE TABLE IF NOT EXISTS %1$s (
+                        id BINARY(16) NOT NULL PRIMARY KEY,
+                        metadata JSON,
+                        %2$s VECTOR(1536) NOT NULL,
+                        VECTOR INDEX %3$s (%2$s)
+                    )
+                    SQL,
+                $this->tableName,
+                $this->vectorFieldName,
+                $this->indexName,
+            ),
+        );
+    }
+}