diff --git a/examples/store/document-splitting.php b/examples/store/document-splitting.php
new file mode 100644
index 00000000..153743c2
--- /dev/null
+++ b/examples/store/document-splitting.php
@@ -0,0 +1,23 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use Symfony\AI\Store\Document\Loader\TextFileLoader;
+use Symfony\AI\Store\Document\Transformer\TextSplitTransformer;
+
+require_once dirname(__DIR__).'/vendor/autoload.php';
+
+$loader = new TextFileLoader();
+$splitter = new TextSplitTransformer(); // invoked without options below, so its default chunk size and overlap apply
+$source = dirname(__DIR__, 2).'/fixtures/lorem.txt'; // fixture located two directory levels above this script's folder
+
+$documents = iterator_to_array($splitter($loader($source))); // the loader output is fed straight into the splitter and collected into an array
+
+dump($documents);
diff --git a/examples/store/document-vectorizing.php b/examples/store/document-vectorizing.php
new file mode 100644
index 00000000..71beda9a
--- /dev/null
+++ b/examples/store/document-vectorizing.php
@@ -0,0 +1,40 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use Symfony\AI\Platform\Bridge\OpenAI\Embeddings;
+use Symfony\AI\Platform\Bridge\OpenAI\PlatformFactory;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Document\VectorDocument;
+use Symfony\AI\Store\Document\Vectorizer;
+use Symfony\Component\Dotenv\Dotenv;
+use Symfony\Component\Uid\Uuid;
+
+require_once dirname(__DIR__).'/vendor/autoload.php';
+(new Dotenv())->loadEnv(dirname(__DIR__).'/.env');
+
+if (empty($_ENV['OPENAI_API_KEY'])) { // stop early with exit code 1 when no API key is configured
+    echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL;
+    exit(1);
+}
+
+$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
+$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE);
+
+$textDocuments = [
+    new TextDocument(Uuid::v4(), 'Hello World'),
+    new TextDocument(Uuid::v4(), 'Lorem ipsum dolor sit amet'),
+    new TextDocument(Uuid::v4(), 'PHP Hypertext Preprocessor'),
+];
+
+$vectorizer = new Vectorizer($platform, $embeddings);
+$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments); // returns one VectorDocument per TextDocument
+
+dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments)); // dumps what getDimensions() reports for every vector
diff --git a/examples/store/mariadb-similarity-search-gemini.php b/examples/store/mariadb-similarity-search-gemini.php
index a3273488..0350af39 100644
--- a/examples/store/mariadb-similarity-search-gemini.php
+++ b/examples/store/mariadb-similarity-search-gemini.php
@@ -11,20 +11,21 @@
 
 use Doctrine\DBAL\DriverManager;
 use Doctrine\DBAL\Tools\DsnParser;
-use PhpLlm\LlmChain\Chain\Chain;
-use PhpLlm\LlmChain\Chain\Toolbox\ChainProcessor;
-use PhpLlm\LlmChain\Chain\Toolbox\Tool\SimilaritySearch;
-use PhpLlm\LlmChain\Chain\Toolbox\Toolbox;
-use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings;
-use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings\TaskType;
-use PhpLlm\LlmChain\Platform\Bridge\Google\Gemini;
-use PhpLlm\LlmChain\Platform\Bridge\Google\PlatformFactory;
-use PhpLlm\LlmChain\Platform\Message\Message;
-use PhpLlm\LlmChain\Platform\Message\MessageBag;
-use
PhpLlm\LlmChain\Store\Bridge\MariaDB\Store; -use PhpLlm\LlmChain\Store\Document\Metadata; -use PhpLlm\LlmChain\Store\Document\TextDocument; -use PhpLlm\LlmChain\Store\Indexer; +use Symfony\AI\Agent\Agent; +use Symfony\AI\Agent\Toolbox\AgentProcessor; +use Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch; +use Symfony\AI\Agent\Toolbox\Toolbox; +use Symfony\AI\Platform\Bridge\Google\Embeddings; +use Symfony\AI\Platform\Bridge\Google\Embeddings\TaskType; +use Symfony\AI\Platform\Bridge\Google\Gemini; +use Symfony\AI\Platform\Bridge\Google\PlatformFactory; +use Symfony\AI\Platform\Message\Message; +use Symfony\AI\Platform\Message\MessageBag; +use Symfony\AI\Store\Bridge\MariaDB\Store; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Vectorizer; +use Symfony\AI\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -66,20 +67,21 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']); $embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]); -$indexer = new Indexer($platform, $embeddings, $store); +$vectorizer = new Vectorizer($platform, $embeddings); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new Gemini(Gemini::GEMINI_2_FLASH_LITE); $similaritySearch = new SimilaritySearch($platform, $embeddings, $store); $toolbox = Toolbox::create($similaritySearch); -$processor = new ChainProcessor($toolbox); -$chain = new Chain($platform, $model, [$processor], [$processor]); +$processor = new AgentProcessor($toolbox); +$agent = new Agent($platform, $model, [$processor], [$processor]); $messages = new MessageBag( Message::forSystem('Please answer all user questions only using SimilaritySearch function.'), Message::ofUser('Which movie fits the theme of the mafia?') ); -$response = $chain->call($messages); +$response = $agent->call($messages); echo 
$response->getContent().\PHP_EOL; diff --git a/examples/store/mariadb-similarity-search.php b/examples/store/mariadb-similarity-search.php index 383e3c56..9a38fdf9 100644 --- a/examples/store/mariadb-similarity-search.php +++ b/examples/store/mariadb-similarity-search.php @@ -21,6 +21,7 @@ use Symfony\AI\Store\Bridge\MariaDB\Store; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Vectorizer; use Symfony\AI\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -62,7 +63,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new GPT(GPT::GPT_4O_MINI); diff --git a/examples/store/mongodb-similarity-search.php b/examples/store/mongodb-similarity-search.php index 63714deb..c394c7f9 100644 --- a/examples/store/mongodb-similarity-search.php +++ b/examples/store/mongodb-similarity-search.php @@ -22,6 +22,7 @@ use Symfony\AI\Store\Bridge\MongoDB\Store; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Vectorizer; use Symfony\AI\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -61,7 +62,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); // initialize the index diff --git a/examples/store/pinecone-similarity-search.php b/examples/store/pinecone-similarity-search.php index 90ee08a2..be00797d 100644 --- 
a/examples/store/pinecone-similarity-search.php +++ b/examples/store/pinecone-similarity-search.php @@ -22,6 +22,7 @@ use Symfony\AI\Store\Bridge\Pinecone\Store; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Vectorizer; use Symfony\AI\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -55,7 +56,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new GPT(GPT::GPT_4O_MINI); diff --git a/fixtures/lorem.txt b/fixtures/lorem.txt new file mode 100644 index 00000000..a2600b83 --- /dev/null +++ b/fixtures/lorem.txt @@ -0,0 +1,15 @@ +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. +Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, +ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, +fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, +justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper +nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim. +Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius +laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies +nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero, +sit amet adipiscing sem neque sed ipsum. Nam quam nunc, blandit vel, luctus pulvinar, hendrerit id, lorem. 
+Maecenas nec odio et ante tincidunt tempus. Donec vitae sapien ut libero venenatis faucibus. Nullam quis
+ante. Etiam sit amet orci eget eros faucibus tincidunt. Duis leo. Sed fringilla mauris sit amet nibh. Donec
+sodales sagittis magna. Sed consequat, leo eget bibendum sodales, augue velit cursus nunc, quis gravida
+magna mi a libero. Fusce vulputate eleifend sapien. Vestibulum purus quam, scelerisque ut, mollis sed,
+nonummy id, met
diff --git a/src/store/src/Document/Loader/TextFileLoader.php b/src/store/src/Document/Loader/TextFileLoader.php
new file mode 100644
index 00000000..d3e1890e
--- /dev/null
+++ b/src/store/src/Document/Loader/TextFileLoader.php
@@ -0,0 +1,55 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document\Loader;
+
+use Symfony\AI\Store\Document\LoaderInterface;
+use Symfony\AI\Store\Document\Metadata;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Exception\RuntimeException;
+use Symfony\Component\Uid\Uuid;
+
+/**
+ * Loads a single plain-text file and yields it as one TextDocument.
+ *
+ * The file content is trimmed and the given path is stored under the "source" metadata key.
+ * Because __invoke() is a generator, the file is only checked and read once the result is iterated.
+ *
+ * @author Christopher Hertel
+ */
+final readonly class TextFileLoader implements LoaderInterface
+{
+    /**
+     * @param string               $source  path of the text file to load
+     * @param array<string, mixed> $options not used by this loader
+     *
+     * @return iterable<TextDocument> yields exactly one document
+     *
+     * @throws RuntimeException when $source is not a regular file or its content cannot be read
+     */
+    public function __invoke(string $source, array $options = []): iterable
+    {
+        if (!is_file($source)) {
+            throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));
+        }
+
+        // Check readability first: file_get_contents() would raise a PHP warning on an unreadable file.
+        $content = is_readable($source) ? file_get_contents($source) : false;
+
+        if (false === $content) {
+            throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
+        }
+
+        yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
+            'source' => $source,
+        ]));
+    }
+}
diff --git a/src/store/src/Document/LoaderInterface.php b/src/store/src/Document/LoaderInterface.php
new file mode 100644
index 00000000..6b4aa689
--- /dev/null
+++ b/src/store/src/Document/LoaderInterface.php
@@ -0,0 +1,26 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this
source code.
+ */
+
+namespace Symfony\AI\Store\Document;
+
+/**
+ * @author Christopher Hertel
+ */
+interface LoaderInterface
+{
+    /**
+     * @param string               $source  Identifier for the loader to load the documents from, e.g. file path, folder, or URL.
+     * @param array<string, mixed> $options loader specific set of options to control the loading process
+     *
+     * @return iterable<TextDocument> iterable of TextDocuments loaded from the source
+     */
+    public function __invoke(string $source, array $options = []): iterable;
+}
diff --git a/src/store/src/Document/Transformer/ChainTransformer.php b/src/store/src/Document/Transformer/ChainTransformer.php
new file mode 100644
index 00000000..686e1c49
--- /dev/null
+++ b/src/store/src/Document/Transformer/ChainTransformer.php
@@ -0,0 +1,44 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document\Transformer;
+
+use Symfony\AI\Store\Document\TransformerInterface;
+
+/**
+ * Applies a list of transformers one after another: each transformer receives the documents returned by the
+ * previous one, and the same $options array is handed to every transformer.
+ */
+final readonly class ChainTransformer implements TransformerInterface
+{
+    /**
+     * @var TransformerInterface[]
+     */
+    private array $transformers;
+
+    /**
+     * @param iterable<TransformerInterface> $transformers
+     */
+    public function __construct(iterable $transformers)
+    {
+        // Keys are dropped so that transformers yielded under duplicate keys cannot overwrite each other.
+        $this->transformers = $transformers instanceof \Traversable ? iterator_to_array($transformers, false) : $transformers;
+    }
+
+    public function __invoke(iterable $documents, array $options = []): iterable
+    {
+        foreach ($this->transformers as $transformer) {
+            $documents = $transformer($documents, $options);
+        }
+
+        return $documents;
+    }
+}
diff --git a/src/store/src/Document/Transformer/ChunkDelayTransformer.php b/src/store/src/Document/Transformer/ChunkDelayTransformer.php
new file mode 100644
index 00000000..045149eb
--- /dev/null
+++ b/src/store/src/Document/Transformer/ChunkDelayTransformer.php
@@ -0,0 +1,59 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document\Transformer;
+
+use Symfony\AI\Store\Document\TransformerInterface;
+use Symfony\Component\Clock\ClockInterface;
+
+/**
+ * This transformer splits the batch of documents into chunks and delays in-between with x seconds, which is useful
+ * when indexing a lot of documents and facing API rate limits.
+ *
+ * @author Christopher Hertel
+ */
+final readonly class ChunkDelayTransformer implements TransformerInterface
+{
+    public const OPTION_CHUNK_SIZE = 'chunk_size';
+    public const OPTION_DELAY = 'delay';
+
+    public function __construct(
+        private ClockInterface $clock,
+    ) {
+    }
+
+    /**
+     * Passes every document through unchanged and pauses before each new chunk begins.
+     *
+     * @param array{chunk_size?: int, delay?: int} $options chunk_size defaults to 50, delay (seconds) to 10
+     */
+    public function __invoke(iterable $documents, array $options = []): iterable
+    {
+        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50;
+        $delay = $options[self::OPTION_DELAY] ?? 10;
+
+        $counter = 0;
+        foreach ($documents as $document) {
+            // A complete chunk has been handed out: wait before starting the next one, then count from zero
+            // again so that every chunk boundary is delayed, not only the first, and nothing sleeps at the end.
+            if ($chunkSize > 0 && $chunkSize === $counter) {
+                if (0 !== $delay) {
+                    $this->clock->sleep($delay);
+                }
+
+                $counter = 0;
+            }
+
+            yield $document;
+            ++$counter;
+        }
+    }
+}
diff --git a/src/store/src/Document/Transformer/TextSplitTransformer.php b/src/store/src/Document/Transformer/TextSplitTransformer.php
new file mode 100644
index 00000000..f13b444b
--- /dev/null
+++ b/src/store/src/Document/Transformer/TextSplitTransformer.php
@@ -0,0 +1,78 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document\Transformer;
+
+use Symfony\AI\Store\Document\Metadata;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Document\TransformerInterface;
+use Symfony\AI\Store\Exception\InvalidArgumentException;
+use Symfony\Component\Uid\Uuid;
+
+/**
+ * Splits a TextDocument into smaller chunks of specified size with optional overlap.
+ * If the document's content is shorter than the specified chunk size, it returns the original document as a single chunk.
+ * Overlap cannot be negative and must be less than the chunk size.
+ *
+ * @author Christopher Hertel
+ */
+final readonly class TextSplitTransformer implements TransformerInterface
+{
+    public const OPTION_CHUNK_SIZE = 'chunk_size';
+    public const OPTION_OVERLAP = 'overlap';
+
+    /**
+     * @param array{chunk_size?: int, overlap?: int} $options chunk_size defaults to 1000 and overlap to 200 characters
+     *
+     * @throws InvalidArgumentException once iteration starts, when overlap is negative or not smaller than chunk_size
+     */
+    public function __invoke(iterable $documents, array $options = []): iterable
+    {
+        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000;
+        $overlap = $options[self::OPTION_OVERLAP] ?? 200;
+
+        if ($overlap < 0 || $overlap >= $chunkSize) {
+            throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
+        }
+
+        foreach ($documents as $document) {
+            if (mb_strlen($document->content) <= $chunkSize) {
+                yield $document;
+
+                continue;
+            }
+
+            $text = $document->content;
+            $length = mb_strlen($text);
+            $start = 0;
+
+            while ($start < $length) {
+                $end = min($start + $chunkSize, $length);
+                $chunkText = mb_substr($text, $start, $end - $start);
+
+                // Inherited metadata is spread first so that the chunk's own parent_id and text always win,
+                // e.g. when the incoming document is itself a chunk produced by an earlier split.
+                yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
+                    ...$document->metadata,
+                    'parent_id' => $document->id,
+                    'text' => $chunkText,
+                ]));
+
+                // The text is exhausted; another round would only repeat a tail that is already covered.
+                if ($end >= $length) {
+                    break;
+                }
+
+                $start += ($chunkSize - $overlap);
+            }
+        }
+    }
+}
diff --git a/src/store/src/Document/TransformerInterface.php b/src/store/src/Document/TransformerInterface.php
new file mode 100644
index 00000000..cdb022bd
--- /dev/null
+++ b/src/store/src/Document/TransformerInterface.php
@@ -0,0 +1,30 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document;
+
+/**
+ * A Transformer is designed to mutate a stream of TextDocuments with the purpose of preparing them for indexing.
+ * It can reduce or expand the number of documents, modify their content or metadata.
+ * It should not act blocking, but is expected to iterate over incoming documents and yield prepared ones.
+ *
+ * @author Christopher Hertel
+ */
+interface TransformerInterface
+{
+    /**
+     * @param iterable<TextDocument> $documents
+     * @param array<string, mixed>   $options
+     *
+     * @return iterable<TextDocument>
+     */
+    public function __invoke(iterable $documents, array $options = []): iterable;
+}
diff --git a/src/store/src/Document/Vectorizer.php b/src/store/src/Document/Vectorizer.php
new file mode 100644
index 00000000..03ad8621
--- /dev/null
+++ b/src/store/src/Document/Vectorizer.php
@@ -0,0 +1,71 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Document;
+
+use Symfony\AI\Platform\Capability;
+use Symfony\AI\Platform\Model;
+use Symfony\AI\Platform\PlatformInterface;
+
+/**
+ * The Vectorizer encapsulates the logic to convert a collection of TextDocuments into VectorDocuments. It checks for
+ * the model's capabilities to handle batch processing or handles it with HttpClient's concurrency feature.
+ */
+final readonly class Vectorizer
+{
+    public function __construct(
+        private PlatformInterface $platform,
+        private Model $model,
+    ) {
+    }
+
+    /**
+     * Requests one vector per document and pairs every vector with the id and metadata of its document.
+     *
+     * @param TextDocument[] $documents
+     *
+     * @return VectorDocument[] in the same order as $documents; empty when no documents are given
+     */
+    public function vectorizeDocuments(array $documents): array
+    {
+        if ([] === $documents) {
+            return [];
+        }
+
+        // Vectors are matched by position below (array_merge() yields a 0-based list and the batch response
+        // is assumed to do the same), so the documents are re-indexed to 0..n-1 as well.
+        $documents = array_values($documents);
+
+        if ($this->model->supports(Capability::INPUT_MULTIPLE)) {
+            $response = $this->platform->request($this->model, array_map(fn (TextDocument $document) => $document->content, $documents));
+
+            $vectors = $response->getContent();
+        } else {
+            // All requests are issued first; their content is only read in the second loop.
+            $responses = [];
+            foreach ($documents as $document) {
+                $responses[] = $this->platform->request($this->model, $document->content);
+            }
+
+            $vectors = [];
+            foreach ($responses as $response) {
+                $vectors = array_merge($vectors, $response->getContent());
+            }
+        }
+
+        $vectorDocuments = [];
+        foreach ($documents as $i => $document) {
+            $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
+        }
+
+        return $vectorDocuments;
+    }
+}
diff
--git a/src/store/src/Indexer.php b/src/store/src/Indexer.php
index fea15d67..385f10a5 100644
--- a/src/store/src/Indexer.php
+++ b/src/store/src/Indexer.php
@@ -13,85 +13,49 @@
 
 use Psr\Log\LoggerInterface;
 use Psr\Log\NullLogger;
-use Symfony\AI\Platform\Capability;
-use Symfony\AI\Platform\Model;
-use Symfony\AI\Platform\PlatformInterface;
 use Symfony\AI\Store\Document\TextDocument;
-use Symfony\AI\Store\Document\VectorDocument;
-use Symfony\Component\Clock\Clock;
-use Symfony\Component\Clock\ClockInterface;
+use Symfony\AI\Store\Document\Vectorizer;
 
 /**
+ * Converts a collection of TextDocuments into VectorDocuments and pushes them to a store implementation.
+ *
  * @author Christopher Hertel
  */
 final readonly class Indexer
 {
-    private ClockInterface $clock;
-
     public function __construct(
-        private PlatformInterface $platform,
-        private Model $model,
+        private Vectorizer $vectorizer,
         private StoreInterface $store,
-        ?ClockInterface $clock = null,
         private LoggerInterface $logger = new NullLogger(),
     ) {
-        $this->clock = $clock ?? Clock::get();
     }
 
     /**
      * @param TextDocument|iterable $documents
+     * @param int $chunkSize number of documents to vectorize and store in one batch; a value below 1 never matches the buffer size, so all documents are then sent as a single batch
      */
-    public function index(TextDocument|iterable $documents, int $chunkSize = 0, int $sleep = 0): void
+    public function index(TextDocument|iterable $documents, int $chunkSize = 50): void
     {
         if ($documents instanceof TextDocument) {
             $documents = [$documents];
         }
 
-        if ([] === $documents) {
-            $this->logger->debug('No documents to index');
-
-            return;
-        }
-
-        $chunks = 0 !== $chunkSize ? array_chunk($documents, $chunkSize) : [$documents];
-
-        foreach ($chunks as $chunk) {
-            $this->store->add(...$this->createVectorDocuments($chunk));
-
-            if (0 !== $sleep) {
-                $this->clock->sleep($sleep);
-            }
-        }
-    }
-
-    /**
-     * @param TextDocument[] $documents
-     *
-     * @return VectorDocument[]
-     */
-    private function createVectorDocuments(array $documents): array
-    {
-        if ($this->model->supports(Capability::INPUT_MULTIPLE)) {
-            $response = $this->platform->request($this->model, array_map(fn (TextDocument $document) => $document->content, $documents));
-
-            $vectors = $response->getContent();
-        } else {
-            $responses = [];
-            foreach ($documents as $document) {
-                $responses[] = $this->platform->request($this->model, $document->content);
-            }
+        $counter = 0; // total number of documents seen, only used for the log message
+        $chunk = []; // buffer of documents waiting to be vectorized
+        foreach ($documents as $document) {
+            $chunk[] = $document;
+            ++$counter;
 
-            $vectors = [];
-            foreach ($responses as $response) {
-                $vectors = array_merge($vectors, $response->getContent());
+            if ($chunkSize === \count($chunk)) { // buffer is full: vectorize and store it, then start a new one
+                $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk));
+                $chunk = [];
             }
         }
 
-        $vectorDocuments = [];
-        foreach ($documents as $i => $document) {
-            $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
+        if (\count($chunk) > 0) { // flush the documents that did not fill a whole chunk
+            $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk));
         }
 
-        return $vectorDocuments;
+        $this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter));
     }
 }
diff --git a/src/store/tests/Document/Loader/TextFileLoaderTest.php b/src/store/tests/Document/Loader/TextFileLoaderTest.php
new file mode 100644
index 00000000..bd816dd1
--- /dev/null
+++ b/src/store/tests/Document/Loader/TextFileLoaderTest.php
@@ -0,0 +1,61 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Tests\Document\Loader;
+
+use PHPUnit\Framework\Attributes\CoversClass;
+use PHPUnit\Framework\Attributes\Test;
+use PHPUnit\Framework\TestCase;
+use Symfony\AI\Store\Document\Loader\TextFileLoader;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Exception\RuntimeException;
+
+#[CoversClass(TextFileLoader::class)]
+final class TextFileLoaderTest extends TestCase
+{
+    #[Test]
+    public function loadWithInvalidSource(): void
+    {
+        $loader = new TextFileLoader();
+
+        self::expectException(RuntimeException::class);
+        self::expectExceptionMessage('File "/invalid/source.txt" does not exist.');
+
+        iterator_to_array($loader('/invalid/source.txt')); // the loader is a generator, so the exception only surfaces on iteration
+    }
+
+    #[Test]
+    public function loadWithValidSource(): void
+    {
+        $loader = new TextFileLoader();
+
+        $documents = iterator_to_array($loader(\dirname(__DIR__, 5).'/fixtures/lorem.txt')); // fixture five directory levels above this test's folder
+
+        self::assertCount(1, $documents);
+        self::assertInstanceOf(TextDocument::class, $document = $documents[0]);
+        self::assertStringStartsWith('Lorem ipsum', $document->content);
+        self::assertStringEndsWith('nonummy id, met', $document->content);
+        self::assertSame(1500, \strlen($document->content)); // byte length of the trimmed fixture content
+    }
+
+    #[Test]
+    public function sourceIsPresentInMetadata(): void
+    {
+        $loader = new TextFileLoader();
+
+        $source = \dirname(__DIR__, 5).'/fixtures/lorem.txt';
+        $documents = iterator_to_array($loader($source));
+
+        self::assertCount(1, $documents);
+        self::assertInstanceOf(TextDocument::class, $document = $documents[0]);
+        self::assertSame($source, $document->metadata['source']); // the path is stored exactly as it was passed in
+    }
+}
diff --git a/src/store/tests/Document/Transformer/ChainTransformerTest.php b/src/store/tests/Document/Transformer/ChainTransformerTest.php
new file mode 100644
index 00000000..f2ec9f5c
--- /dev/null
+++ b/src/store/tests/Document/Transformer/ChainTransformerTest.php
@@ -0,0 +1,67 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this
source code.
+ */
+
+namespace Symfony\AI\Store\Tests\Document\Transformer;
+
+use PHPUnit\Framework\Attributes\CoversClass;
+use PHPUnit\Framework\Attributes\Test;
+use PHPUnit\Framework\TestCase;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Document\Transformer\ChainTransformer;
+use Symfony\AI\Store\Document\TransformerInterface;
+use Symfony\Component\Uid\Uuid;
+
+#[CoversClass(ChainTransformer::class)] // the class exercised here is the chain itself, not the interface
+final class ChainTransformerTest extends TestCase
+{
+    #[Test]
+    public function chainTransformerAppliesAllTransformersInOrder(): void
+    {
+        $transformerA = new class implements TransformerInterface { // appends "-A" to every document's content
+            public function __invoke(iterable $documents, array $options = []): iterable
+            {
+                foreach ($documents as $document) {
+                    yield new TextDocument($document->id, $document->content.'-A');
+                }
+            }
+        };
+
+        $transformerB = new class implements TransformerInterface { // appends "-B" to every document's content
+            public function __invoke(iterable $documents, array $options = []): iterable
+            {
+                foreach ($documents as $document) {
+                    yield new TextDocument($document->id, $document->content.'-B');
+                }
+            }
+        };
+
+        $chain = new ChainTransformer([$transformerA, $transformerB]);
+        $documents = [
+            new TextDocument(Uuid::v4(), 'foo'),
+            new TextDocument(Uuid::v4(), 'bar'),
+        ];
+
+        $result = iterator_to_array($chain($documents));
+
+        self::assertSame('foo-A-B', $result[0]->content); // suffix order proves A ran before B
+        self::assertSame('bar-A-B', $result[1]->content);
+    }
+
+    public function testChainTransformerWithNoTransformersReturnsInput(): void
+    {
+        $chain = new ChainTransformer([]);
+        $documents = [new TextDocument(Uuid::v4(), 'baz')];
+
+        $result = iterator_to_array($chain($documents)); // an empty chain hands the input array back untouched
+
+        self::assertSame('baz', $result[0]->content);
+    }
+}
diff --git a/src/store/tests/Document/Transformer/TextSplitTransformerTest.php b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
new file mode 100644
index 00000000..85f89e6b
--- /dev/null
+++
b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
@@ -0,0 +1,202 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Tests\Document\Transformer;
+
+use PHPUnit\Framework\Attributes\CoversClass;
+use PHPUnit\Framework\Attributes\Test;
+use PHPUnit\Framework\TestCase;
+use Symfony\AI\Store\Document\Metadata;
+use Symfony\AI\Store\Document\TextDocument;
+use Symfony\AI\Store\Document\Transformer\TextSplitTransformer;
+use Symfony\AI\Store\Exception\InvalidArgumentException;
+use Symfony\Component\Uid\Uuid;
+
+#[CoversClass(TextSplitTransformer::class)]
+final class TextSplitTransformerTest extends TestCase
+{
+    private TextSplitTransformer $transformer;
+
+    protected function setUp(): void
+    {
+        $this->transformer = new TextSplitTransformer();
+    }
+
+    #[Test]
+    public function splitReturnsSingleChunkForShortText(): void
+    {
+        $document = new TextDocument(Uuid::v4(), 'short text');
+
+        $chunks = iterator_to_array(($this->transformer)([$document]));
+
+        self::assertCount(1, $chunks);
+        self::assertSame('short text', $chunks[0]->content);
+    }
+
+    #[Test]
+    public function textLength(): void
+    {
+        self::assertSame(1500, mb_strlen($this->getLongText())); // guards the fixture size the other tests rely on
+    }
+
+    #[Test]
+    public function splitSplitsLongTextWithOverlap(): void
+    {
+        $document = new TextDocument(Uuid::v4(), $this->getLongText());
+
+        $chunks = iterator_to_array(($this->transformer)([$document])); // defaults: chunk size 1000, overlap 200
+
+        self::assertCount(2, $chunks);
+
+        self::assertSame(1000, mb_strlen($chunks[0]->content));
+        self::assertSame(mb_substr($this->getLongText(), 0, 1000), $chunks[0]->content);
+
+        self::assertSame(700, mb_strlen($chunks[1]->content));
+        self::assertSame(mb_substr($this->getLongText(), 800, 700), $chunks[1]->content);
+    }
+
+    #[Test]
+    public function splitWithCustomChunkSizeAndOverlap(): void
+    {
+        $document = new TextDocument(Uuid::v4(), $this->getLongText());
+
+        $chunks = iterator_to_array(($this->transformer)([$document], [
+            TextSplitTransformer::OPTION_CHUNK_SIZE => 150,
+            TextSplitTransformer::OPTION_OVERLAP => 25,
+        ]));
+
+        self::assertCount(12, $chunks); // chunks start every 125 characters: 0, 125, ..., 1375
+
+        self::assertSame(150, mb_strlen($chunks[0]->content));
+        self::assertSame(mb_substr($this->getLongText(), 0, 150), $chunks[0]->content);
+
+        self::assertSame(150, mb_strlen($chunks[1]->content));
+        self::assertSame(mb_substr($this->getLongText(), 125, 150), $chunks[1]->content);
+
+        self::assertSame(150, mb_strlen($chunks[2]->content));
+        self::assertSame(mb_substr($this->getLongText(), 250, 150), $chunks[2]->content);
+
+        self::assertSame(150, mb_strlen($chunks[3]->content));
+        self::assertSame(mb_substr($this->getLongText(), 375, 150), $chunks[3]->content);
+
+        self::assertSame(150, mb_strlen($chunks[4]->content));
+        self::assertSame(mb_substr($this->getLongText(), 500, 150), $chunks[4]->content);
+
+        self::assertSame(150, mb_strlen($chunks[5]->content));
+        self::assertSame(mb_substr($this->getLongText(), 625, 150), $chunks[5]->content);
+
+        self::assertSame(150, mb_strlen($chunks[6]->content));
+        self::assertSame(mb_substr($this->getLongText(), 750, 150), $chunks[6]->content);
+
+        self::assertSame(150, mb_strlen($chunks[7]->content));
+        self::assertSame(mb_substr($this->getLongText(), 875, 150), $chunks[7]->content);
+
+        self::assertSame(150, mb_strlen($chunks[8]->content));
+        self::assertSame(mb_substr($this->getLongText(), 1000, 150), $chunks[8]->content);
+
+        self::assertSame(150, mb_strlen($chunks[9]->content));
+        self::assertSame(mb_substr($this->getLongText(), 1125, 150), $chunks[9]->content);
+
+        self::assertSame(150, mb_strlen($chunks[10]->content));
+        self::assertSame(mb_substr($this->getLongText(), 1250, 150), $chunks[10]->content);
+
+        self::assertSame(125, mb_strlen($chunks[11]->content));
+        self::assertSame(mb_substr($this->getLongText(), 1375, 150), $chunks[11]->content);
+    }
+
+    #[Test]
+    public function splitWithZeroOverlap(): void
+    {
+        $document = new TextDocument(Uuid::v4(), $this->getLongText());
+
+        $chunks = iterator_to_array(($this->transformer)([$document], [
+            TextSplitTransformer::OPTION_OVERLAP => 0,
+        ]));
+
+        self::assertCount(2, $chunks);
+        self::assertSame(mb_substr($this->getLongText(), 0, 1000), $chunks[0]->content);
+        self::assertSame(mb_substr($this->getLongText(), 1000, 500), $chunks[1]->content);
+    }
+
+    #[Test]
+    public function parentIdIsSetInMetadata(): void
+    {
+        $document = new TextDocument(Uuid::v4(), $this->getLongText());
+
+        $chunks = iterator_to_array(($this->transformer)([$document], [
+            TextSplitTransformer::OPTION_CHUNK_SIZE => 1000,
+            TextSplitTransformer::OPTION_OVERLAP => 200,
+        ]));
+
+        self::assertCount(2, $chunks);
+        self::assertSame($document->id, $chunks[0]->metadata['parent_id']);
+        self::assertSame($document->id, $chunks[1]->metadata['parent_id']);
+    }
+
+    #[Test]
+    public function metadataIsInherited(): void
+    {
+        $document = new TextDocument(Uuid::v4(), $this->getLongText(), new Metadata([
+            'key' => 'value',
+            'foo' => 'bar',
+        ]));
+
+        $chunks = iterator_to_array(($this->transformer)([$document]));
+
+        self::assertCount(2, $chunks);
+        self::assertSame('value', $chunks[0]->metadata['key']);
+        self::assertSame('bar', $chunks[0]->metadata['foo']);
+        self::assertSame('value', $chunks[1]->metadata['key']);
+        self::assertSame('bar', $chunks[1]->metadata['foo']);
+    }
+
+    #[Test]
+    public function splitWithChunkSizeLargerThanText(): void
+    {
+        $document = new TextDocument(Uuid::v4(), 'tiny');
+
+        $chunks = iterator_to_array(($this->transformer)([$document]));
+
+        self::assertCount(1, $chunks);
+        self::assertSame('tiny', $chunks[0]->content);
+    }
+
+    #[Test]
+    public function splitWithOverlapGreaterThanChunkSize(): void
+    {
+        $document = new TextDocument(Uuid::v4(), 'Abcdefg', new Metadata([]));
+        self::expectException(InvalidArgumentException::class);
+        self::expectExceptionMessage('Overlap must be non-negative and less than chunk size.');
+
+        iterator_to_array(($this->transformer)([$document], [ // the option check runs only once the generator is iterated
+            TextSplitTransformer::OPTION_CHUNK_SIZE => 10,
+            TextSplitTransformer::OPTION_OVERLAP => 20,
+        ]));
+    }
+
+    #[Test]
+    public function splitWithNegativeOverlap(): void
+    {
+        $document = new TextDocument(Uuid::v4(), 'Abcdefg', new Metadata([]));
+        self::expectException(InvalidArgumentException::class);
+        self::expectExceptionMessage('Overlap must be non-negative and less than chunk size.');
+
+        iterator_to_array(($this->transformer)([$document], [
+            TextSplitTransformer::OPTION_CHUNK_SIZE => 10,
+            TextSplitTransformer::OPTION_OVERLAP => -1,
+        ]));
+    }
+
+    private function getLongText(): string
+    {
+        return trim(file_get_contents(\dirname(__DIR__, 5).'/fixtures/lorem.txt')); // trimmed like TextFileLoader does
+    }
+}
diff --git a/src/store/tests/IndexerTest.php b/src/store/tests/IndexerTest.php
index 9f7fda60..74cfadec 100644
--- a/src/store/tests/IndexerTest.php
+++ b/src/store/tests/IndexerTest.php
@@ -27,10 +27,10 @@
 use Symfony\AI\Store\Document\Metadata;
 use Symfony\AI\Store\Document\TextDocument;
 use Symfony\AI\Store\Document\VectorDocument;
+use Symfony\AI\Store\Document\Vectorizer;
 use Symfony\AI\Store\Indexer;
 use Symfony\AI\Store\Tests\Double\PlatformTestHandler;
 use Symfony\AI\Store\Tests\Double\TestStore;
-use Symfony\Component\Clock\MockClock;
 use Symfony\Component\Uid\Uuid;
 
 #[CoversClass(Indexer::class)]
@@ -47,18 +47,13 @@
 final class IndexerTest extends TestCase
 {
     #[Test]
-    public function embedSingleDocument(): void
+    public function indexSingleDocument(): void
     {
         $document = new TextDocument($id = Uuid::v4(), 'Test content');
         $vector = new Vector([0.1, 0.2, 0.3]);
+        $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResponse($vector)), new Embeddings());
 
-        $indexer = new Indexer(
-            PlatformTestHandler::createPlatform(new VectorResponse($vector)),
-            new Embeddings(),
-            $store = new TestStore(),
-            new MockClock(),
-        );
-
+        $indexer = new Indexer($vectorizer, $store = new TestStore());
         $indexer->index($document);
 
         self::assertCount(1, $store->documents);
@@ -68,38
+63,27 @@ public function embedSingleDocument(): void } #[Test] - public function embedEmptyDocumentList(): void + public function indexEmptyDocumentList(): void { $logger = self::createMock(LoggerInterface::class); $logger->expects(self::once())->method('debug')->with('No documents to index'); + $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(), new Embeddings()); - $indexer = new Indexer( - PlatformTestHandler::createPlatform(), - new Embeddings(), - $store = new TestStore(), - new MockClock(), - $logger, - ); - + $indexer = new Indexer($vectorizer, $store = new TestStore(), $logger); $indexer->index([]); self::assertSame([], $store->documents); } #[Test] - public function embedDocumentWithMetadata(): void + public function indexDocumentWithMetadata(): void { $metadata = new Metadata(['key' => 'value']); $document = new TextDocument($id = Uuid::v4(), 'Test content', $metadata); $vector = new Vector([0.1, 0.2, 0.3]); + $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResponse($vector)), new Embeddings()); - $indexer = new Indexer( - PlatformTestHandler::createPlatform(new VectorResponse($vector)), - new Embeddings(), - $store = new TestStore(), - new MockClock(), - ); - + $indexer = new Indexer($vectorizer, $store = new TestStore()); $indexer->index($document); self::assertSame(1, $store->addCalls); @@ -109,30 +93,4 @@ public function embedDocumentWithMetadata(): void self::assertSame($vector, $store->documents[0]->vector); self::assertSame(['key' => 'value'], $store->documents[0]->metadata->getArrayCopy()); } - - #[Test] - public function embedWithSleep(): void - { - $vector1 = new Vector([0.1, 0.2, 0.3]); - $vector2 = new Vector([0.4, 0.5, 0.6]); - - $document1 = new TextDocument(Uuid::v4(), 'Test content 1'); - $document2 = new TextDocument(Uuid::v4(), 'Test content 2'); - - $indexer = new Indexer( - PlatformTestHandler::createPlatform(new VectorResponse($vector1, $vector2)), - new Embeddings(), - $store = new 
TestStore(), - $clock = new MockClock('2024-01-01 00:00:00'), - ); - - $indexer->index( - documents: [$document1, $document2], - sleep: 3 - ); - - self::assertSame(1, $store->addCalls); - self::assertCount(2, $store->documents); - self::assertSame('2024-01-01 00:00:03', $clock->now()->format('Y-m-d H:i:s')); - } }