Skip to content

feat: implement document loader & transformer for store indexing #57

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions examples/store/document-splitting.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

use Symfony\AI\Store\Document\Loader\TextFileLoader;
use Symfony\AI\Store\Document\Transformer\TextSplitTransformer;

// Pull in the Composer autoloader located one directory above this script.
require_once dirname(__DIR__).'/vendor/autoload.php';

// Sample text file located two directories above this script.
$source = dirname(__DIR__, 2).'/fixtures/lorem.txt';

$loader = new TextFileLoader();
$splitter = new TextSplitTransformer();

// Load the file, hand the loaded documents to the splitter and collect everything it produces.
$loaded = $loader($source);
$chunks = $splitter($loaded);
$documents = iterator_to_array($chunks);

dump($documents);
40 changes: 40 additions & 0 deletions examples/store/document-vectorizing.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

use Symfony\AI\Platform\Bridge\OpenAI\Embeddings;
use Symfony\AI\Platform\Bridge\OpenAI\PlatformFactory;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\VectorDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;

require_once dirname(__DIR__).'/vendor/autoload.php';
(new Dotenv())->loadEnv(dirname(__DIR__).'/.env');

// Stop right away when no (non-empty) API key is available in the environment.
$apiKey = $_ENV['OPENAI_API_KEY'] ?? null;
if (empty($apiKey)) {
    echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL;
    exit(1);
}

$platform = PlatformFactory::create($apiKey);
$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE);
$vectorizer = new Vectorizer($platform, $embeddings);

// One text document, each with a fresh UUID, per sample sentence.
$textDocuments = [];
foreach (['Hello World', 'Lorem ipsum dolor sit amet', 'PHP Hypertext Preprocessor'] as $content) {
    $textDocuments[] = new TextDocument(Uuid::v4(), $content);
}

$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments);

// Print the dimensions reported by the vector of every returned document.
$readDimensions = static fn (VectorDocument $document) => $document->vector->getDimensions();

dump(array_map($readDimensions, $vectorDocuments));
38 changes: 20 additions & 18 deletions examples/store/mariadb-similarity-search-gemini.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,21 @@

use Doctrine\DBAL\DriverManager;
use Doctrine\DBAL\Tools\DsnParser;
use PhpLlm\LlmChain\Chain\Chain;
use PhpLlm\LlmChain\Chain\Toolbox\ChainProcessor;
use PhpLlm\LlmChain\Chain\Toolbox\Tool\SimilaritySearch;
use PhpLlm\LlmChain\Chain\Toolbox\Toolbox;
use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings;
use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings\TaskType;
use PhpLlm\LlmChain\Platform\Bridge\Google\Gemini;
use PhpLlm\LlmChain\Platform\Bridge\Google\PlatformFactory;
use PhpLlm\LlmChain\Platform\Message\Message;
use PhpLlm\LlmChain\Platform\Message\MessageBag;
use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Indexer;
use Symfony\AI\Agent\Agent;
use Symfony\AI\Agent\Toolbox\AgentProcessor;
use Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch;
use Symfony\AI\Agent\Toolbox\Toolbox;
use Symfony\AI\Platform\Bridge\Google\Embeddings;
use Symfony\AI\Platform\Bridge\Google\Embeddings\TaskType;
use Symfony\AI\Platform\Bridge\Google\Gemini;
use Symfony\AI\Platform\Bridge\Google\PlatformFactory;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\AI\Store\Bridge\MariaDB\Store;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\AI\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;

Expand Down Expand Up @@ -66,20 +67,21 @@
// create embeddings for documents
$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']);
$embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]);
$indexer = new Indexer($platform, $embeddings, $store);
$vectorizer = new Vectorizer($platform, $embeddings);
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new Gemini(Gemini::GEMINI_2_FLASH_LITE);

$similaritySearch = new SimilaritySearch($platform, $embeddings, $store);
$toolbox = Toolbox::create($similaritySearch);
$processor = new ChainProcessor($toolbox);
$chain = new Chain($platform, $model, [$processor], [$processor]);
$processor = new AgentProcessor($toolbox);
$agent = new Agent($platform, $model, [$processor], [$processor]);

$messages = new MessageBag(
Message::forSystem('Please answer all user questions only using SimilaritySearch function.'),
Message::ofUser('Which movie fits the theme of the mafia?')
);
$response = $chain->call($messages);
$response = $agent->call($messages);

echo $response->getContent().\PHP_EOL;
4 changes: 3 additions & 1 deletion examples/store/mariadb-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
use Symfony\AI\Store\Bridge\MariaDB\Store;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\AI\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -62,7 +63,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new GPT(GPT::GPT_4O_MINI);
Expand Down
4 changes: 3 additions & 1 deletion examples/store/mongodb-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
use Symfony\AI\Store\Bridge\MongoDB\Store;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\AI\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -61,7 +62,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

// initialize the index
Expand Down
4 changes: 3 additions & 1 deletion examples/store/pinecone-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
use Symfony\AI\Store\Bridge\Pinecone\Store;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\AI\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -55,7 +56,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new GPT(GPT::GPT_4O_MINI);
Expand Down
15 changes: 15 additions & 0 deletions fixtures/lorem.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa.
Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis,
ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo,
fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae,
justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper
nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim.
Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius
laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies
nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero,
sit amet adipiscing sem neque sed ipsum. Nam quam nunc, blandit vel, luctus pulvinar, hendrerit id, lorem.
Maecenas nec odio et ante tincidunt tempus. Donec vitae sapien ut libero venenatis faucibus. Nullam quis
ante. Etiam sit amet orci eget eros faucibus tincidunt. Duis leo. Sed fringilla mauris sit amet nibh. Donec
sodales sagittis magna. Sed consequat, leo eget bibendum sodales, augue velit cursus nunc, quis gravida
magna mi a libero. Fusce vulputate eleifend sapien. Vestibulum purus quam, scelerisque ut, mollis sed,
nonummy id, met
41 changes: 41 additions & 0 deletions src/store/src/Document/Loader/TextFileLoader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Document\Loader;

use Symfony\AI\Store\Document\LoaderInterface;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Exception\RuntimeException;
use Symfony\Component\Uid\Uuid;

/**
* @author Christopher Hertel <[email protected]>
*/
final readonly class TextFileLoader implements LoaderInterface
{
    /**
     * Reads the file at $source and yields its trimmed content as a single TextDocument.
     *
     * The document gets a random (v4) UUID and carries the given path under the "source" metadata key.
     * Because this method is a generator, the checks below only run once the result is iterated.
     *
     * @param string               $source  path of the file to read
     * @param array<string, mixed> $options not read by this loader
     *
     * @return iterable<TextDocument>
     *
     * @throws RuntimeException when $source is not a regular file or its content cannot be read
     */
    public function __invoke(string $source, array $options = []): iterable
    {
        if (false === is_file($source)) {
            throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));
        }

        $raw = file_get_contents($source);
        if (!\is_string($raw)) {
            // file_get_contents() signals a failed read by returning false.
            throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
        }

        $id = Uuid::v4();
        $metadata = new Metadata(['source' => $source]);

        yield new TextDocument($id, trim($raw), $metadata);
    }
}
26 changes: 26 additions & 0 deletions src/store/src/Document/LoaderInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Document;

/**
* @author Christopher Hertel <[email protected]>
*/
interface LoaderInterface
{
/**
* Loads the documents identified by the given source.
*
* Implementations are invoked like a function: $loader($source, $options).
*
* @param string               $source  Identifier for the loader to load the documents from, e.g. file path, folder, or URL.
* @param array<string, mixed> $options Loader-specific set of options to control the loading process; empty by default.
*
* @return iterable<TextDocument> iterable of TextDocuments loaded from the source
*/
public function __invoke(string $source, array $options = []): iterable;
}
39 changes: 39 additions & 0 deletions src/store/src/Document/Transformer/ChainTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Document\Transformer;

use Symfony\AI\Store\Document\TransformerInterface;

/**
 * Applies a sequence of transformers one after another: the output of each
 * transformer becomes the input of the next one.
 */
final readonly class ChainTransformer implements TransformerInterface
{
    /**
     * @var list<TransformerInterface>
     */
    private array $transformers;

    /**
     * @param iterable<TransformerInterface> $transformers transformers in the order they are applied
     */
    public function __construct(iterable $transformers)
    {
        // Collect as a plain list and ignore the incoming keys: with the default
        // preserve_keys=true, a Traversable yielding the same key more than once
        // (e.g. a generator using "yield from") would silently overwrite transformers.
        $this->transformers = $transformers instanceof \Traversable
            ? iterator_to_array($transformers, false)
            : array_values($transformers);
    }

    /**
     * Passes the documents through every configured transformer, in order.
     *
     * Each transformer receives the same $options array. Without any configured
     * transformer the given documents are returned unchanged.
     *
     * @param iterable             $documents documents handed to the first transformer
     * @param array<string, mixed> $options   options forwarded as-is to every transformer
     *
     * @return iterable result of the last transformer in the chain
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        foreach ($this->transformers as $transformer) {
            $documents = $transformer($documents, $options);
        }

        return $documents;
    }
}
51 changes: 51 additions & 0 deletions src/store/src/Document/Transformer/ChunkDelayTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Document\Transformer;

use Symfony\AI\Store\Document\TransformerInterface;
use Symfony\Component\Clock\ClockInterface;

/**
* This transformer splits the batch of documents into chunks and delays in-between with x seconds, which is useful
* when indexing a lot of documents and facing API rate limits.
*
* @author Christopher Hertel <[email protected]>
*/
final readonly class ChunkDelayTransformer implements TransformerInterface
{
    /** Option key for the number of documents passed through before a pause (defaults to 50). */
    public const OPTION_CHUNK_SIZE = 'chunk_size';

    /** Option key for the pause handed to the clock's sleep() after each chunk (defaults to 10). */
    public const OPTION_DELAY = 'delay';

    /**
     * @param ClockInterface $clock clock used to perform the pause between chunks
     */
    public function __construct(
        private ClockInterface $clock,
    ) {
    }

    /**
     * Yields the given documents unchanged and pauses after every full chunk.
     *
     * After each run of "chunk_size" documents the clock is asked to sleep for "delay".
     * A delay of 0 disables the pause entirely. As this is a generator, the pause
     * takes place when the consumer requests the document following a full chunk.
     *
     * @param array{chunk_size?: int, delay?: int} $options
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50;
        $delay = $options[self::OPTION_DELAY] ?? 10;

        $counter = 0;
        foreach ($documents as $document) {
            yield $document;
            ++$counter;

            if ($chunkSize === $counter) {
                // Start counting the next chunk. Without this reset the condition
                // matches only once and every later chunk goes through without a pause.
                $counter = 0;

                if (0 !== $delay) {
                    $this->clock->sleep($delay);
                }
            }
        }
    }
}
Loading