diff --git a/packages/playht/.gitignore b/packages/playht/.gitignore new file mode 100644 index 0000000..453af40 --- /dev/null +++ b/packages/playht/.gitignore @@ -0,0 +1 @@ +test-output-*.mp3 diff --git a/packages/playht/package.json b/packages/playht/package.json index 1820670..b34eb39 100644 --- a/packages/playht/package.json +++ b/packages/playht/package.json @@ -1,6 +1,6 @@ { "name": "playht", - "version": "0.13.0", + "version": "0.14.0", "description": "NodeJS SDK for PlayHT generative AI text-to-speech APIs", "files": [ "dist/**/*", @@ -22,6 +22,7 @@ "verify": "yarn check && yarn test", "check": "yarn build:protobufs && tsc -p tsconfig.json --noEmit && prettier --check . && eslint --ext .ts ./src", "release": "yarn && yarn verify && yarn build && cp ../../README.md . && npm publish || true && rm README.md", + "release-alpha": "yarn && yarn verify && yarn build && cp ../../README.md . && npm publish --tag=alpha || true && rm README.md", "postpublish": "PACKAGE_VERSION=$(cat package.json | grep \\\"version\\\" | head -1 | awk -F: '{ print $2 }' | sed 's/[\",]//g' | tr -d '[[:space:]]') && git tag v$PACKAGE_VERSION && git push --tags" }, "devDependencies": { diff --git a/packages/playht/src/__tests__/e2eStreaming.test.ts b/packages/playht/src/__tests__/e2eStreaming.test.ts new file mode 100644 index 0000000..8426558 --- /dev/null +++ b/packages/playht/src/__tests__/e2eStreaming.test.ts @@ -0,0 +1,92 @@ +import { buffer } from 'node:stream/consumers'; +import fs from 'node:fs'; +import { describe, expect, it } from '@jest/globals'; +import * as PlayHT from '../index'; +import { E2E_CONFIG } from './e2eTestConfig'; + +describe('E2E Streaming', () => { + describe('Play3.0-mini', () => { + it('streams from text', async () => { + PlayHT.init({ + userId: E2E_CONFIG.USER_ID, + apiKey: E2E_CONFIG.API_KEY, + }); + + const streamFromText = await PlayHT.stream('Hello from SDK test.', { + voiceEngine: 'Play3.0-mini', + // @ts-expect-error emotion is not part of the Play3.0-mini contract + emotion: 'female_surprised', + outputFormat: 'mp3', + }); + + const audioBuffer = await buffer(streamFromText); + fs.writeFileSync('test-output-Play3.0-mini.mp3', audioBuffer); // for debugging + + expect(audioBuffer.length).toBeGreaterThan(30_000); // errors would result in smaller payloads + expect(audioBuffer.toString('ascii')).toContain('ID3'); + }); + }); + + describe('PlayDialog', () => { + it('streams from text', async () => { + PlayHT.init({ + userId: E2E_CONFIG.USER_ID, + apiKey: E2E_CONFIG.API_KEY, + }); + + const streamFromText = await PlayHT.stream('Host 1: Is this the SDK?\nHost 2: Yes, it is.', { + voiceEngine: 'PlayDialog', + outputFormat: 'mp3', + temperature: 1.2, + quality: 'high', + voiceId2: 's3://voice-cloning-zero-shot/775ae416-49bb-4fb6-bd45-740f205d20a1/jennifersaad/manifest.json', + turnPrefix: 'Host 1:', + turnPrefix2: 'Host 2:', + language: 'english', + + // @ts-expect-error emotion and language are not part of the PlayDialog contract + emotion: 'female_surprised', + styleGuidance: 16, + }); + + const audioBuffer = await buffer(streamFromText); + fs.writeFileSync('test-output-PlayDialog.mp3', audioBuffer); // for debugging + + expect(audioBuffer.length).toBeGreaterThan(30_000); // errors would result in smaller payloads + expect(audioBuffer.toString('ascii')).toContain('ID3'); + }, 120_000); + }); + + describe('PlayDialogMultilingual', () => { + it('streams from text', async () => { + PlayHT.init({ + userId: E2E_CONFIG.USER_ID, + apiKey: E2E_CONFIG.API_KEY, + }); + + const streamFromText = await PlayHT.stream( + 'Host 1: Estamos todos prontos para fazer o que for necessário aqui. Host 2: É impossível esquecer tudo que vivemos.', + { + voiceEngine: 'PlayDialog', + outputFormat: 'mp3', + temperature: 1.2, + quality: 'high', + voiceId2: 's3://voice-cloning-zero-shot/775ae416-49bb-4fb6-bd45-740f205d20a1/jennifersaad/manifest.json', + turnPrefix: 'Host 1:', + turnPrefix2: 'Host 2:', + language: 'portuguese', + + // @ts-expect-error emotion and language are not part of the PlayDialog contract + emotion: 'female_surprised', + styleGuidance: 16, + }, + ); + + const audioBuffer = await buffer(streamFromText); + fs.writeFileSync('test-output-PlayDialogMultilingual.mp3', audioBuffer); // for debugging + + expect(audioBuffer.length).toBeGreaterThan(30_000); // errors would result in smaller payloads + expect(audioBuffer.toString('ascii')).toContain('ID3'); + }, 120_000); + }); +}); diff --git a/packages/playht/src/api/apiCommon.ts b/packages/playht/src/api/apiCommon.ts index 1ed72ca..2020c78 100644 --- a/packages/playht/src/api/apiCommon.ts +++ b/packages/playht/src/api/apiCommon.ts @@ -9,6 +9,7 @@ import type { PlayHT20OutputStreamFormat, Play30EngineStreamOptions, OutputFormat, + PlayDialogEngineStreamOptions, } from '..'; import { PassThrough, Readable, Writable } from 'node:stream'; import { APISettingsStore } from './APISettingsStore'; @@ -18,7 +19,7 @@ import { generateV2Speech } from './generateV2Speech'; import { generateV2Stream } from './generateV2Stream'; import { textStreamToSentences } from './textStreamToSentences'; import { generateGRpcStream } from './generateGRpcStream'; -import { generateV3Stream } from './internal/tts/v3/generateV3Stream'; +import { generateAuthBasedStream } from './internal/tts/v3/generateAuthBasedStream'; import { PlayRequestConfig } from './internal/config/PlayRequestConfig'; export type V1ApiOptions = { @@ -43,8 +44,7 @@ export type V2ApiOptions = { textGuidance?: number; }; -export type V3ApiOptions = Pick & - Omit; +export type AuthBasedEngineOptions = Play30EngineStreamOptions | PlayDialogEngineStreamOptions; type Preset = 'real-time' | 'balanced' | 'low-latency' | 'high-quality'; @@ -104,8 +104,9 @@ export async function internalGenerateStreamFromString( const v2Options = toV2Options(options, true); return await generateGRpcStream(input, options.voiceId, v2Options); } - case 'Play3.0-mini': { - return await generateV3Stream(input, options.voiceId, options, reqConfig); + case 'Play3.0-mini': + case 'PlayDialog': { + return await generateAuthBasedStream(input, options.voiceId, options, reqConfig); } } } diff --git a/packages/playht/src/api/internal/tts/v3/V3InternalSettings.ts b/packages/playht/src/api/internal/tts/v3/V3InternalSettings.ts index 96421a3..8151033 100644 --- a/packages/playht/src/api/internal/tts/v3/V3InternalSettings.ts +++ b/packages/playht/src/api/internal/tts/v3/V3InternalSettings.ts @@ -1,3 +1,13 @@ +/** + * "Public" because these are the engines the users can choose from. + */ +export type PublicAuthBasedEngine = 'Play3.0-mini' | 'PlayDialog'; + +/** + * "Internal" because these are the engines we use internally to determine the inference address (the HTTP endpoint). + */ +export type InternalAuthBasedEngine = PublicAuthBasedEngine | 'PlayDialogMultilingual'; + export type V3InternalSettings = { // how much time before expiration should we refresh the coordinates coordinatesExpirationAdvanceRefreshTimeMs?: number; @@ -5,7 +15,11 @@ export type V3InternalSettings = { coordinatesExpirationMinimalFrequencyMs?: number; // number of attempts when calling API to get new coordinates coordinatesGetApiCallMaxRetries?: number; - customInferenceCoordinatesGenerator?: (userId: string, apiKey: string) => Promise; + customInferenceCoordinatesGenerator?: ( + engine: InternalAuthBasedEngine, + userId: string, + apiKey: string, + ) => Promise; }; export type InferenceCoordinatesEntry = { diff --git a/packages/playht/src/api/internal/tts/v3/backgroundWarmUpAuthBasedEngine.ts b/packages/playht/src/api/internal/tts/v3/backgroundWarmUpAuthBasedEngine.ts index f9e78f7..ed0241a 100644 --- a/packages/playht/src/api/internal/tts/v3/backgroundWarmUpAuthBasedEngine.ts +++ b/packages/playht/src/api/internal/tts/v3/backgroundWarmUpAuthBasedEngine.ts @@ -2,16 +2,29 @@ import axios, { AxiosRequestConfig } from 'axios'; import { keepAliveHttpsAgent } from '../../http'; import { PlayRequestConfig } from '../../config/PlayRequestConfig'; import { createOrGetInferenceAddress } from './createOrGetInferenceAddress'; +import { InternalAuthBasedEngine, PublicAuthBasedEngine } from './V3InternalSettings'; -export const backgroundWarmUpAuthBasedEngine = (reqConfigSettings: PlayRequestConfig['settings']) => { - warmUp(reqConfigSettings).catch((error: any) => { - // eslint-disable-next-line no-process-env - console.log(`[PlayHT SDK] Error while warming up SDK: ${error.message}`, process.env.DEBUG ? error : ''); - }); +export const backgroundWarmUpAuthBasedEngine = ( + selectedEngine: PublicAuthBasedEngine, + reqConfigSettings: PlayRequestConfig['settings'], +) => { + const engines = + selectedEngine === 'Play3.0-mini' + ? (['Play3.0-mini'] as const) + : (['PlayDialog', 'PlayDialogMultilingual'] as const); + for (const engine of engines) { + warmUp(engine, reqConfigSettings).catch((error: any) => { + console.log( + `[PlayHT SDK] Error while warming up SDK (${engine}): ${error.message}`, + // eslint-disable-next-line no-process-env + process.env.DEBUG ? error : '', + ); + }); + } }; -const warmUp = async (reqConfigSettings: PlayRequestConfig['settings']) => { - const inferenceAddress = await createOrGetInferenceAddress(reqConfigSettings); +const warmUp = async (engine: InternalAuthBasedEngine, reqConfigSettings: PlayRequestConfig['settings']) => { + const inferenceAddress = await createOrGetInferenceAddress(engine, reqConfigSettings); const streamOptions: AxiosRequestConfig = { method: 'OPTIONS', url: inferenceAddress, diff --git a/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.test.ts b/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.test.ts index f7c7c1f..7c7e3b3 100644 --- a/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.test.ts +++ b/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.test.ts @@ -1,21 +1,25 @@ -import { describe, expect } from '@jest/globals'; +import { beforeEach, describe, expect } from '@jest/globals'; import { createOrGetInferenceAddress } from './createOrGetInferenceAddress'; +import { InternalAuthBasedEngine } from './V3InternalSettings'; async function sleep(timeout: number) { await new Promise((resolve) => setTimeout(resolve, timeout)); } describe('createOrGetInferenceAddress', () => { - let callSequenceNumber = 0; + let callSequenceNumber: number; + beforeEach(() => { + callSequenceNumber = 0; + }); const reqConfigSettings = (userId: string) => ({ userId, - apiKey: 'test', + apiKey: 'test-api-key', experimental: { v3: { - customInferenceCoordinatesGenerator: async () => { + customInferenceCoordinatesGenerator: async (_: InternalAuthBasedEngine, u: string) => { await sleep(10); // simulate a delay return { - inferenceAddress: `call ${userId} #${++callSequenceNumber}`, + inferenceAddress: `call ${u} #${++callSequenceNumber}`, expiresAtMs: Date.now() + 1_000_000, }; }, @@ -29,7 +33,7 @@ describe('createOrGetInferenceAddress', () => { it('serializes concurrent calls for the same user', async () => { const numberOfTestCalls = 15; const calls = Array.from({ length: numberOfTestCalls }, () => - createOrGetInferenceAddress(reqConfigSettings('test-user')), + createOrGetInferenceAddress('Play3.0-mini', reqConfigSettings('test-user')), ); // Expect all calls to return 'call #1', not 'call #1', 'call #2', 'call #3', etc. @@ -39,19 +43,19 @@ describe('createOrGetInferenceAddress', () => { it('doesnt serialize calls for different users', async () => { const numberOfTestCalls = 3; const callsOne = Array.from({ length: numberOfTestCalls }, (_, i) => - createOrGetInferenceAddress(reqConfigSettings(`test-user#${i}`)), + createOrGetInferenceAddress('Play3.0-mini', reqConfigSettings(`test-user#${i}`)), ); const callsTwo = Array.from({ length: numberOfTestCalls }, (_, i) => - createOrGetInferenceAddress(reqConfigSettings(`test-user#${i}`)), + createOrGetInferenceAddress('Play3.0-mini', reqConfigSettings(`test-user#${i}`)), ); expect(await Promise.all([...callsOne, ...callsTwo])).toEqual([ - 'call test-user#0 #2', - 'call test-user#1 #3', - 'call test-user#2 #4', - 'call test-user#0 #2', - 'call test-user#1 #3', - 'call test-user#2 #4', + 'call test-user#0 #1', + 'call test-user#1 #2', + 'call test-user#2 #3', + 'call test-user#0 #1', + 'call test-user#1 #2', + 'call test-user#2 #3', ]); }); }); diff --git a/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.ts b/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.ts index c170d22..dbfa6dd 100644 --- a/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.ts +++ b/packages/playht/src/api/internal/tts/v3/createOrGetInferenceAddress.ts @@ -4,19 +4,24 @@ import { keepAliveHttpsAgent } from '../../http'; import { PlayRequestConfig } from '../../config/PlayRequestConfig'; import { APISettingsStore } from '../../../APISettingsStore'; import { UserId } from '../../types'; -import { InferenceCoordinatesEntry } from './V3InternalSettings'; +import { InternalAuthBasedEngine, InferenceCoordinatesEntry, V3InternalSettings } from './V3InternalSettings'; import { V3_DEFAULT_SETTINGS } from './V3DefaultSettings'; -const inferenceCoordinatesStore: Record = {}; +const inferenceCoordinatesStores: Record> = { + 'Play3.0-mini': {}, + PlayDialog: {}, + PlayDialogMultilingual: {}, +}; // By default, the inference coordinates generator will call the Play API to get the inference coordinates. -const defaultInferenceCoordinatesGenerator = async ( - userId: string, - apiKey: string, +const defaultInferenceCoordinatesGenerator: V3InternalSettings['customInferenceCoordinatesGenerator'] = async ( + engine, + userId, + apiKey, ): Promise => { const data = await axios .post( - 'https://api.play.ht/api/v3/auth', + 'https://api.play.ht/api/v3/auth?dialog', {}, { headers: { @@ -28,19 +33,23 @@ const defaultInferenceCoordinatesGenerator = async ( ) .then( (response) => - response.data as { - inference_address: string; + response.data as Record & { expires_at_ms: number; }, ) .catch((error: any) => convertError(error)); + const httpStreamingUrl = data[engine]?.http_streaming_url; + if (!httpStreamingUrl) { + return convertError(new Error(`Engine ${engine} not found in AUTH response`)); + } return { - inferenceAddress: data.inference_address, + inferenceAddress: httpStreamingUrl, expiresAtMs: data.expires_at_ms, }; }; const createInferenceCoordinates = async ( + voiceEngine: InternalAuthBasedEngine, reqConfigSettings?: PlayRequestConfig['settings'], attemptNo = 0, ): Promise => { @@ -64,13 +73,13 @@ const createInferenceCoordinates = async ( V3_DEFAULT_SETTINGS.coordinatesGetApiCallMaxRetries; try { - const newInferenceCoordinatesEntry = await inferenceCoordinatesGenerator(userId, apiKey); + const newInferenceCoordinatesEntry = await inferenceCoordinatesGenerator(voiceEngine, userId, apiKey); const automaticRefreshDelay = Math.max( coordinatesExpirationMinimalFrequencyMs, newInferenceCoordinatesEntry.expiresAtMs - Date.now() - coordinatesExpirationAdvanceRefreshTimeMs, ); - setTimeout(() => createInferenceCoordinates(reqConfigSettings), automaticRefreshDelay).unref(); - inferenceCoordinatesStore[userId] = newInferenceCoordinatesEntry; + setTimeout(() => createInferenceCoordinates(voiceEngine, reqConfigSettings), automaticRefreshDelay).unref(); + inferenceCoordinatesStores[voiceEngine][userId] = newInferenceCoordinatesEntry; return newInferenceCoordinatesEntry; } catch (e) { if (attemptNo >= coordinatesGetApiCallMaxRetries) { @@ -79,7 +88,7 @@ const createInferenceCoordinates = async ( return new Promise((resolve) => { setTimeout( () => { - resolve(createInferenceCoordinates(reqConfigSettings, attemptNo + 1)); + resolve(createInferenceCoordinates(voiceEngine, reqConfigSettings, attemptNo + 1)); }, 500 * (attemptNo + 1), ).unref(); @@ -90,15 +99,16 @@ const createInferenceCoordinates = async ( const inferenceCoordinatesCreationPromise: Record> = {}; export const createOrGetInferenceAddress = async ( + voiceEngine: InternalAuthBasedEngine, reqConfigSettings?: PlayRequestConfig['settings'], ): Promise => { const userId = (reqConfigSettings?.userId ?? APISettingsStore.getSettings().userId) as UserId; - const inferenceCoordinatesEntry = inferenceCoordinatesStore[userId]; + const inferenceCoordinatesEntry = inferenceCoordinatesStores[voiceEngine][userId]; if (inferenceCoordinatesEntry && inferenceCoordinatesEntry.expiresAtMs >= Date.now() - 5_000) { return inferenceCoordinatesEntry.inferenceAddress; } else { if (!(userId in inferenceCoordinatesCreationPromise)) { - inferenceCoordinatesCreationPromise[userId] = createInferenceCoordinates(reqConfigSettings); + inferenceCoordinatesCreationPromise[userId] = createInferenceCoordinates(voiceEngine, reqConfigSettings); } const newInferenceCoordinatesEntry = (await inferenceCoordinatesCreationPromise[userId])!; delete inferenceCoordinatesCreationPromise[userId]; diff --git a/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.test.ts b/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.test.ts new file mode 100644 index 0000000..4bf209b --- /dev/null +++ b/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.test.ts @@ -0,0 +1,22 @@ +import { describe, expect, it } from '@jest/globals'; +import { getInternalEngineForEndpoint } from './generateAuthBasedStream'; + +describe('getInternalEngineForEndpoint', () => { + it('returns Play3.0-mini for Play3.0-mini engine', () => { + const options = { voiceEngine: 'Play3.0-mini' } as const; + const result = getInternalEngineForEndpoint(options); + expect(result).toBe('Play3.0-mini'); + }); + + it('returns PlayDialog for PlayDialog engine with no language', () => { + const options = { voiceEngine: 'PlayDialog' } as const; + const result = getInternalEngineForEndpoint(options); + expect(result).toBe('PlayDialog'); + }); + + it('returns PlayDialogMultilingual for PlayDialog engine with non-english language', () => { + const options = { voiceEngine: 'PlayDialog', language: 'spanish' } as const; + const result = getInternalEngineForEndpoint(options); + expect(result).toBe('PlayDialogMultilingual'); + }); +}); diff --git a/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.ts b/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.ts new file mode 100644 index 0000000..6e5ea8e --- /dev/null +++ b/packages/playht/src/api/internal/tts/v3/generateAuthBasedStream.ts @@ -0,0 +1,101 @@ +import type { AuthBasedEngineOptions, V2ApiOptions } from '../../../apiCommon'; +import type { Play30EngineStreamOptions, PlayDialogEngineStreamOptions } from '../../../../index'; +import axios, { AxiosRequestConfig } from 'axios'; +import { convertError } from '../../convertError'; +import { keepAliveHttpsAgent } from '../../http'; +import { PlayRequestConfig } from '../../config/PlayRequestConfig'; +import { createOrGetInferenceAddress } from './createOrGetInferenceAddress'; +import { InternalAuthBasedEngine } from './V3InternalSettings'; + +export async function generateAuthBasedStream( + text: string, + voice: string, + options: AuthBasedEngineOptions, + reqConfig: PlayRequestConfig, +): Promise { + const inferenceAddress = await createOrGetInferenceAddress(getInternalEngineForEndpoint(options), reqConfig.settings); + const streamOptions: AxiosRequestConfig = { + method: 'POST', + url: inferenceAddress, + headers: { + accept: outputFormatToMimeType(options.outputFormat), + }, + data: createPayloadForEngine(text, voice, options), + responseType: 'stream', + httpsAgent: keepAliveHttpsAgent, + signal: reqConfig.signal, + }; + + const response = await axios(streamOptions).catch((error: any) => convertError(error)); + return response.data; +} + +const outputFormatToMimeType = (outputFormat: V2ApiOptions['outputFormat'] | undefined): `audio/${string}` => { + if (!outputFormat) { + return outputFormatToMimeType('mp3'); + } + switch (outputFormat) { + case 'raw': + // fallthrough + case 'mulaw': + return 'audio/basic'; + case 'wav': + return 'audio/wav'; + case 'ogg': + return 'audio/ogg'; + case 'flac': + return 'audio/flac'; + case 'mp3': + return 'audio/mpeg'; + } +}; + +const createPayloadForEngine = ( + text: string, + voice: string, + options: Play30EngineStreamOptions | PlayDialogEngineStreamOptions, +) => { + const common = { + text, + voice, + output_format: options.outputFormat, + speed: options.speed, + sample_rate: options.sampleRate, + seed: options.seed, + temperature: options.temperature, + voice_engine: options.voiceEngine, + language: options.language, + }; + switch (options.voiceEngine) { + case 'Play3.0-mini': + return { + ...common, + quality: options.quality, + voice_guidance: options.voiceGuidance, + text_guidance: options.textGuidance, + style_guidance: options.styleGuidance, + }; + case 'PlayDialog': + return { + ...common, + voice_2: options.voiceId2, + turn_prefix: options.turnPrefix, + turn_prefix_2: options.turnPrefix2, + prompt: options.prompt, + prompt_2: options.prompt2, + voice_conditioning_seconds: options.voiceConditioningSeconds, + voice_conditioning_seconds_2: options.voiceConditioningSeconds2, + }; + } +}; + +// visible for test +export const getInternalEngineForEndpoint = (options: AuthBasedEngineOptions): InternalAuthBasedEngine => { + switch (options.voiceEngine) { + case 'Play3.0-mini': + return 'Play3.0-mini'; + case 'PlayDialog': + if (options.language && options.language !== 'english') return 'PlayDialogMultilingual'; + return 'PlayDialog'; + } +}; diff --git a/packages/playht/src/api/internal/tts/v3/generateV3Stream.ts b/packages/playht/src/api/internal/tts/v3/generateV3Stream.ts deleted file mode 100644 index 3b9925d..0000000 --- a/packages/playht/src/api/internal/tts/v3/generateV3Stream.ts +++ /dev/null @@ -1,63 +0,0 @@ -import type { V2ApiOptions, V3ApiOptions } from '../../../apiCommon'; -import axios, { AxiosRequestConfig } from 'axios'; -import { convertError } from '../../convertError'; -import { keepAliveHttpsAgent } from '../../http'; -import { PlayRequestConfig } from '../../config/PlayRequestConfig'; -import { createOrGetInferenceAddress } from './createOrGetInferenceAddress'; - -export async function generateV3Stream( - text: string, - voice: string, - options: V3ApiOptions, - reqConfig: PlayRequestConfig, -): Promise { - const inferenceAddress = await createOrGetInferenceAddress(reqConfig.settings); - const streamOptions: AxiosRequestConfig = { - method: 'POST', - url: inferenceAddress, - headers: { - accept: outputFormatToMimeType(options.outputFormat), - }, - data: { - text, - voice, - quality: options.quality, - output_format: options.outputFormat, - speed: options.speed, - sample_rate: options.sampleRate, - seed: options.seed, - temperature: options.temperature, - voice_engine: options.voiceEngine, - voice_guidance: options.voiceGuidance, - text_guidance: options.textGuidance, - style_guidance: options.styleGuidance, - language: options.language, - }, - responseType: 'stream', - httpsAgent: keepAliveHttpsAgent, - signal: reqConfig.signal, - }; - - const response = await axios(streamOptions).catch((error: any) => convertError(error)); - return response.data; -} - -const outputFormatToMimeType = (outputFormat: V2ApiOptions['outputFormat'] | undefined): `audio/${string}` => { - if (!outputFormat) { - return outputFormatToMimeType('mp3'); - } - switch (outputFormat) { - case 'raw': - // fallthrough - case 'mulaw': - return 'audio/basic'; - case 'wav': - return 'audio/wav'; - case 'ogg': - return 'audio/ogg'; - case 'flac': - return 'audio/flac'; - case 'mp3': - return 'audio/mpeg'; - } -}; diff --git a/packages/playht/src/api/internal/tts/v3/v3.test.ts b/packages/playht/src/api/internal/tts/v3/v3.test.ts deleted file mode 100644 index 6c61ad4..0000000 --- a/packages/playht/src/api/internal/tts/v3/v3.test.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { buffer } from 'node:stream/consumers'; -import { describe, expect, it } from '@jest/globals'; -import * as PlayHT from '../../../../index'; -import { E2E_CONFIG } from '../../../../__tests__/e2eTestConfig'; - -describe('Auth-Based Models', () => { - describe('Play3.0-mini', () => { - it('streams from text', async () => { - PlayHT.init({ - userId: E2E_CONFIG.USER_ID, - apiKey: E2E_CONFIG.API_KEY, - }); - - const streamFromText = await PlayHT.stream('Hello from SDK test.', { - voiceEngine: 'Play3.0-mini', - outputFormat: 'mp3', - temperature: 1.2, - quality: 'high', - // @ts-expect-error emotion is not part of the Play3.0-mini contract - emotion: 'not-used', - styleGuidance: 16, - }); - - const audioBuffer = await buffer(streamFromText); - // fs.writeFileSync('Play3.0-mini.mp3', audioBuffer); // uncomment this line to save the generated file - - expect(audioBuffer.length).toBeGreaterThan(30_000); // errors would result in smaller payloads - expect(audioBuffer.toString('ascii')).toContain('ID3'); - }); - }); -}); diff --git a/packages/playht/src/index.ts b/packages/playht/src/index.ts index 47280e6..8be05a1 100644 --- a/packages/playht/src/index.ts +++ b/packages/playht/src/index.ts @@ -9,8 +9,9 @@ import { backgroundWarmUpAuthBasedEngine } from './api/internal/tts/v3/backgroun * The various voice engines that can be used for speech synthesis. * * For the lowest latency, use `Play3.0-mini`. + * For the highest quality, use `PlayDialog`. */ -export type VoiceEngine = 'Play3.0-mini' | 'PlayHT2.0-turbo' | 'PlayHT2.0' | 'PlayHT1.0' | 'Standard'; +export type VoiceEngine = 'PlayDialog' | 'Play3.0-mini' | 'PlayHT2.0-turbo' | 'PlayHT2.0' | 'PlayHT1.0' | 'Standard'; /** * Type representing the different input types that can be used to define the format of the input text. @@ -101,20 +102,6 @@ export type VoiceAgeGroup = 'youth' | 'adult' | 'senior'; /** * Potential values for emotions to be applied to speech. - * @typedef {( - * 'female_happy' | - * 'female_sad' | - * 'female_angry' | - * 'female_fearful' | - * 'female_disgust' | - * 'female_surprised' | - * 'male_happy' | - * 'male_sad' | - * 'male_angry' | - * 'male_fearful' | - * 'male_disgust' | - * 'male_surprised' - * )} Emotion */ export type Emotion = | 'female_happy' @@ -203,7 +190,6 @@ export type VoicesFilter = { * @property {OutputQuality} [quality] - Optional parameter to define the output quality of the speech. */ export type SharedSpeechOptions = { - voiceEngine: VoiceEngine; voiceId?: string; inputType?: InputType; speed?: number; @@ -394,12 +380,66 @@ export type Play30EngineStreamOptions = Omit & { + /** + * The identifier for the PlayDialog voice engine. + */ + voiceEngine: 'PlayDialog'; + + /** + * The unique ID for a PlayHT or Cloned Voice. Used for generating turn-based dialogues. + */ + voiceId2?: string; + + /** + * The prefix to indicate the start of a turn in a dialogue with `voice`. + */ + turnPrefix?: string; + + /** + * The prefix to indicate the start of a turn in a dialogue with `voiceId2`. + */ + turnPrefix2?: string; + + /** + * The prompt to be used for the `PlayDialog` model with `voice`. + */ + prompt?: string; + + /** + * The prompt to be used for the `PlayDialog` model with `voiceId2`. + */ + prompt2?: string; + + /** + * The number of seconds of conditioning to use from the selected `voice`. + * If equal to 0 (default value), the generation will be unconditioned. + * Higher values lead to higher model instability. + */ + voiceConditioningSeconds?: number; + + /** + * The number of seconds of conditioning to use from the selected `voiceId2`. + * If equal to 0 (default value), the generation will be unconditioned. + * Higher values lead to higher model instability. + */ + voiceConditioningSeconds2?: number; +}; + /** * The options available for configuring speech synthesis, which include shared options combined with engine-specific * options. @@ -412,7 +452,13 @@ export type SpeechOptions = SharedSpeechOptions & * options. */ export type SpeechStreamOptions = SharedSpeechOptions & - (Play30EngineStreamOptions | PlayHT20EngineStreamOptions | PlayHT10EngineStreamOptions | StandardEngineOptions); + ( + | PlayDialogEngineStreamOptions + | Play30EngineStreamOptions + | PlayHT20EngineStreamOptions + | PlayHT10EngineStreamOptions + | StandardEngineOptions + ); /** * `SpeechOutput` is the output type for a text-to-speech method, providing information about the generated @@ -479,8 +525,9 @@ export type APISettingsInput = { */ export function init(settings: APISettingsInput) { APISettingsStore.setSettings(settings); - if (settings.defaultVoiceEngine === 'Play3.0-mini') { - backgroundWarmUpAuthBasedEngine(settings); + // todo: change to isAuthBasedEngine at the same file of the warm upper + if (settings.defaultVoiceEngine === 'Play3.0-mini' || settings.defaultVoiceEngine === 'PlayDialog') { + backgroundWarmUpAuthBasedEngine(settings.defaultVoiceEngine, settings); } }