feature #858 [Platform][OpenAI] Add support for OpenAI text-to-speech (chr-hertel)

chr-hertel · chr-hertel · commit 07594e91de16 · 2025-11-13T00:37:49.000+01:00
This PR was merged into the main branch. Discussion ---------- [Platform][OpenAI] Add support for OpenAI text-to-speech | Q | A | ------------- | --- | Bug fix? | no | New feature? | yes | Docs? | no | Issues | | License | MIT Adding OpenAI TTS support: https://platform.openai.com/docs/guides/text-to-speech Commits ------- 1ab425d Add support for OpenAI text-to-speech
diff --git a/examples/openai/audio-output.php b/examples/openai/audio-output.php
@@ -0,0 +1,24 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech\Voice;
+
+require_once dirname(__DIR__).'/bootstrap.php';
+
+// Build the OpenAI platform from the API key provided by the environment.
+$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client());
+
+// Text to synthesize and the options forwarded with the request.
+$text = 'Today is a wonderful day to build something people love!';
+$options = [
+    'voice' => Voice::CORAL,
+    'instructions' => 'Speak in a cheerful and positive tone.',
+];
+
+$result = $platform->invoke('gpt-4o-mini-tts', $text, $options);
+
+// Write the binary result to standard output.
+echo $result->asBinary();
diff --git a/src/platform/src/Bridge/OpenAi/ModelCatalog.php b/src/platform/src/Bridge/OpenAi/ModelCatalog.php
@@ -234,6 +234,27 @@ public function __construct(array $additionalModels = [])
                 'class' => Embeddings::class,
                 'capabilities' => [Capability::INPUT_TEXT],
             ],
+            'tts-1' => [
+                'class' => TextToSpeech::class,
+                'capabilities' => [
+                    Capability::INPUT_TEXT,
+                    Capability::OUTPUT_AUDIO,
+                ],
+            ],
+            'tts-1-hd' => [
+                'class' => TextToSpeech::class,
+                'capabilities' => [
+                    Capability::INPUT_TEXT,
+                    Capability::OUTPUT_AUDIO,
+                ],
+            ],
+            'gpt-4o-mini-tts' => [
+                'class' => TextToSpeech::class,
+                'capabilities' => [
+                    Capability::INPUT_TEXT,
+                    Capability::OUTPUT_AUDIO,
+                ],
+            ],
             'whisper-1' => [
                 'class' => Whisper::class,
                 'capabilities' => [
diff --git a/src/platform/src/Bridge/OpenAi/PlatformFactory.php b/src/platform/src/Bridge/OpenAi/PlatformFactory.php
@@ -42,12 +42,14 @@ public static function create(
                 new Gpt\ModelClient($httpClient, $apiKey, $region),
                 new Embeddings\ModelClient($httpClient, $apiKey, $region),
                 new DallE\ModelClient($httpClient, $apiKey, $region),
+                new TextToSpeech\ModelClient($httpClient, $apiKey, $region),
                 new Whisper\ModelClient($httpClient, $apiKey, $region),
             ],
             [
                 new Gpt\ResultConverter(),
                 new Embeddings\ResultConverter(),
                 new DallE\ResultConverter(),
+                new TextToSpeech\ResultConverter(),
                 new Whisper\ResultConverter(),
             ],
             $modelCatalog,
diff --git a/src/platform/src/Bridge/OpenAi/TextToSpeech.php b/src/platform/src/Bridge/OpenAi/TextToSpeech.php
@@ -0,0 +1,21 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\OpenAi;
+
+use Symfony\AI\Platform\Model;
+
+/**
+ * Model type for OpenAI text-to-speech models.
+ *
+ * Adds no behavior of its own; the class exists so that the text-to-speech
+ * ModelClient and ResultConverter can select these models with an
+ * instanceof check in their supports() methods.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+class TextToSpeech extends Model
+{
+}
diff --git a/src/platform/src/Bridge/OpenAi/TextToSpeech/Format.php b/src/platform/src/Bridge/OpenAi/TextToSpeech/Format.php
@@ -0,0 +1,25 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+
+/**
+ * Audio format names as string constants.
+ *
+ * NOTE(review): nothing in this change references these constants; they are
+ * presumably meant as values for the "response_format" request option of
+ * the OpenAI speech endpoint - confirm against the API reference.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+interface Format
+{
+    public const MP3 = 'mp3';
+    public const OPUS = 'opus';
+    public const AAC = 'aac';
+    public const FLAC = 'flac';
+    public const WAV = 'wav';
+    public const PCM = 'pcm';
+}
diff --git a/src/platform/src/Bridge/OpenAi/TextToSpeech/ModelClient.php b/src/platform/src/Bridge/OpenAi/TextToSpeech/ModelClient.php
@@ -0,0 +1,56 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+
+use Symfony\AI\Platform\Bridge\OpenAi\AbstractModelClient;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+use Symfony\AI\Platform\Exception\InvalidArgumentException;
+use Symfony\AI\Platform\Model;
+use Symfony\AI\Platform\ModelClientInterface;
+use Symfony\AI\Platform\Result\RawHttpResult;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+
+/**
+ * Sends text-to-speech requests to the OpenAI "/v1/audio/speech" endpoint.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+final class ModelClient extends AbstractModelClient implements ModelClientInterface
+{
+    public function __construct(
+        private readonly HttpClientInterface $httpClient,
+        #[\SensitiveParameter] private readonly string $apiKey,
+        private readonly ?string $region = null,
+    ) {
+        self::validateApiKey($apiKey);
+    }
+
+    /**
+     * Only models of type TextToSpeech are handled by this client.
+     */
+    public function supports(Model $model): bool
+    {
+        return $model instanceof TextToSpeech;
+    }
+
+    /**
+     * Posts the payload as "input", together with the model name and all given options, as a JSON body.
+     *
+     * @param array<string, mixed> $options must contain "voice"; must not contain "stream" or "stream_format"
+     *
+     * @throws InvalidArgumentException when "voice" is missing or a streaming option is set
+     */
+    public function request(Model $model, array|string $payload, array $options = []): RawHttpResult
+    {
+        if (null === ($options['voice'] ?? null)) {
+            throw new InvalidArgumentException('The "voice" option is required for TextToSpeech requests.');
+        }
+
+        // Any non-null streaming option is rejected, whatever its value.
+        if (null !== ($options['stream_format'] ?? $options['stream'] ?? null)) {
+            throw new InvalidArgumentException('Streaming text to speech results is not supported yet.');
+        }
+
+        $endpoint = \sprintf('%s/v1/audio/speech', self::getBaseUrl($this->region));
+
+        // "model" and "input" are merged last so they win over same-named option keys.
+        $body = array_merge($options, ['model' => $model->getName(), 'input' => $payload]);
+
+        $response = $this->httpClient->request('POST', $endpoint, [
+            'auth_bearer' => $this->apiKey,
+            'headers' => ['Content-Type' => 'application/json'],
+            'json' => $body,
+        ]);
+
+        return new RawHttpResult($response);
+    }
+}
diff --git a/src/platform/src/Bridge/OpenAi/TextToSpeech/ResultConverter.php b/src/platform/src/Bridge/OpenAi/TextToSpeech/ResultConverter.php
@@ -0,0 +1,43 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+use Symfony\AI\Platform\Exception\RuntimeException;
+use Symfony\AI\Platform\Model;
+use Symfony\AI\Platform\Result\BinaryResult;
+use Symfony\AI\Platform\Result\RawHttpResult;
+use Symfony\AI\Platform\Result\RawResultInterface;
+use Symfony\AI\Platform\Result\ResultInterface;
+use Symfony\AI\Platform\ResultConverterInterface as BaseResponseConverter;
+
+/**
+ * Converts the raw HTTP response of a text-to-speech request into a BinaryResult.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+final class ResultConverter implements BaseResponseConverter
+{
+    /**
+     * Only results for models of type TextToSpeech are handled by this converter.
+     */
+    public function supports(Model $model): bool
+    {
+        return $model instanceof TextToSpeech;
+    }
+
+    /**
+     * Wraps the response body in a BinaryResult.
+     *
+     * @throws RuntimeException when the response status code is not 200; the message contains the response body
+     */
+    public function convert(RawResultInterface|RawHttpResult $result, array $options = []): ResultInterface
+    {
+        $response = $result->getObject();
+
+        if (200 !== $response->getStatusCode()) {
+            // Passing false asks getContent() not to throw on error status codes,
+            // so the error body can be included in the exception message.
+            throw new RuntimeException(\sprintf('The OpenAI Text-to-Speech API returned an error: "%s"', $response->getContent(false)));
+        }
+
+        // Reuse the response fetched above instead of calling getObject() a second time.
+        return new BinaryResult($response->getContent());
+    }
+}
diff --git a/src/platform/src/Bridge/OpenAi/TextToSpeech/Voice.php b/src/platform/src/Bridge/OpenAi/TextToSpeech/Voice.php
@@ -0,0 +1,30 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+
+/**
+ * Voice names as string constants, usable as the value of the "voice"
+ * option that the text-to-speech ModelClient requires on every request.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+interface Voice
+{
+    public const ALLOY = 'alloy';
+    public const ASH = 'ash';
+    public const BALLAD = 'ballad';
+    public const CORAL = 'coral';
+    public const ECHO = 'echo';
+    public const FABLE = 'fable';
+    public const NOVA = 'nova';
+    public const ONYX = 'onyx';
+    public const SAGE = 'sage';
+    public const SHIMMER = 'shimmer';
+    public const VERSE = 'verse';
+}
diff --git a/src/platform/tests/Bridge/OpenAi/ModelCatalogTest.php b/src/platform/tests/Bridge/OpenAi/ModelCatalogTest.php
@@ -15,6 +15,7 @@
 use Symfony\AI\Platform\Bridge\OpenAi\Embeddings;
 use Symfony\AI\Platform\Bridge\OpenAi\Gpt;
 use Symfony\AI\Platform\Bridge\OpenAi\ModelCatalog;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
 use Symfony\AI\Platform\Bridge\OpenAi\Whisper;
 use Symfony\AI\Platform\Capability;
 use Symfony\AI\Platform\ModelCatalog\ModelCatalogInterface;
@@ -53,6 +54,11 @@ public static function modelsProvider(): iterable
         yield 'text-embedding-3-large' => ['text-embedding-3-large', Embeddings::class, [Capability::INPUT_TEXT]];
         yield 'text-embedding-3-small' => ['text-embedding-3-small', Embeddings::class, [Capability::INPUT_TEXT]];
 
+        // Text-to-speech models
+        yield 'tts-1' => ['tts-1', TextToSpeech::class, [Capability::INPUT_TEXT, Capability::OUTPUT_AUDIO]];
+        yield 'tts-1-hd' => ['tts-1-hd', TextToSpeech::class, [Capability::INPUT_TEXT, Capability::OUTPUT_AUDIO]];
+        yield 'gpt-4o-mini-tts' => ['gpt-4o-mini-tts', TextToSpeech::class, [Capability::INPUT_TEXT, Capability::OUTPUT_AUDIO]];
+
         // Whisper models
         yield 'whisper-1' => ['whisper-1', Whisper::class, [Capability::INPUT_AUDIO, Capability::OUTPUT_TEXT]];
 
diff --git a/src/platform/tests/Bridge/OpenAi/TextToSpeech/ModelClientTest.php b/src/platform/tests/Bridge/OpenAi/TextToSpeech/ModelClientTest.php
@@ -0,0 +1,88 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Tests\Bridge\OpenAi\TextToSpeech;
+
+use PHPUnit\Framework\TestCase;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech\ModelClient;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech\ResultConverter;
+use Symfony\AI\Platform\Exception\InvalidArgumentException;
+use Symfony\AI\Platform\Model;
+use Symfony\Component\HttpClient\MockHttpClient;
+use Symfony\Component\HttpClient\Response\MockResponse;
+use Symfony\Contracts\HttpClient\ResponseInterface as HttpResponse;
+
+/**
+ * Covers the text-to-speech ModelClient: model support, request building and option validation.
+ *
+ * @author Christopher Hertel <mail@christopher-hertel.de>
+ */
+final class ModelClientTest extends TestCase
+{
+    public function testSupportsTextToSpeechModel()
+    {
+        // Exercise the ModelClient itself, not the ResultConverter.
+        $modelClient = new ModelClient(new MockHttpClient(), 'sk-api-key');
+        $model = new TextToSpeech('tts-1');
+
+        $this->assertTrue($modelClient->supports($model));
+    }
+
+    public function testDoesntSupportOtherModels()
+    {
+        $modelClient = new ModelClient(new MockHttpClient(), 'sk-api-key');
+        $model = new Model('test-model');
+
+        $this->assertFalse($modelClient->supports($model));
+    }
+
+    public function testHappyCase()
+    {
+        // The assertions run inside the mock response factory, i.e. when the request is actually issued.
+        $resultCallback = static function (string $method, string $url, array $options): HttpResponse {
+            self::assertSame('POST', $method);
+            self::assertSame('https://api.openai.com/v1/audio/speech', $url);
+            self::assertSame('Authorization: Bearer sk-api-key', $options['normalized_headers']['authorization'][0]);
+            // Options come first, followed by the "model" and "input" keys added by the client.
+            $expectedBody = '{"voice":"alloy","instructions":"Speak like a pirate","model":"tts-1","input":"Hello World!"}';
+            self::assertSame($expectedBody, $options['body']);
+
+            return new MockResponse();
+        };
+        $httpClient = new MockHttpClient([$resultCallback]);
+        $modelClient = new ModelClient($httpClient, 'sk-api-key');
+        $modelClient->request(new TextToSpeech('tts-1'), 'Hello World!', [
+            'voice' => 'alloy',
+            'instructions' => 'Speak like a pirate',
+        ]);
+    }
+
+    public function testFailsWithoutVoiceOption()
+    {
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('The "voice" option is required for TextToSpeech requests.');
+
+        $httpClient = new MockHttpClient();
+        $modelClient = new ModelClient($httpClient, 'sk-api-key');
+        $modelClient->request(new TextToSpeech('tts-1'), 'Hello World!', [
+            'instructions' => 'Speak like a pirate',
+        ]);
+    }
+
+    public function testFailsWithStreamingOptions()
+    {
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('Streaming text to speech results is not supported yet.');
+
+        $httpClient = new MockHttpClient();
+        $modelClient = new ModelClient($httpClient, 'sk-api-key');
+        $modelClient->request(new TextToSpeech('tts-1'), 'Hello World!', [
+            'voice' => 'alloy',
+            'stream' => true,
+        ]);
+    }
+}
diff --git a/src/platform/tests/Bridge/OpenAi/TextToSpeech/ResultConverterTest.php b/src/platform/tests/Bridge/OpenAi/TextToSpeech/ResultConverterTest.php