From 58307ded2632fc2bd3a89779a67d5a2e3f6af798 Mon Sep 17 00:00:00 2001 From: rany2 Date: Wed, 25 Jan 2023 18:29:40 +0200 Subject: [PATCH] Update edge_tts to v6.1.3 * Sort voices in util's --list-voices * Disable overlapping subtitles by default * Try to fix subtitles for many-hour-long TTS generation * Match Microsoft Edge connection behavior more closely * Bump edge_tts version --- lint.sh | 2 ++ pylintrc | 4 +-- src/edge_tts/communicate.py | 66 ++++++++++++++++++++++++++----------- src/edge_tts/submaker.py | 2 +- src/edge_tts/util.py | 8 +++-- src/edge_tts/version.py | 2 +- 6 files changed, 57 insertions(+), 27 deletions(-) diff --git a/lint.sh b/lint.sh index 66ee7cc..dcbf843 100755 --- a/lint.sh +++ b/lint.sh @@ -1,2 +1,4 @@ +#!/bin/sh +set -ux find src examples -name '*.py' | xargs pylint find src examples -name '*.py' | xargs mypy diff --git a/pylintrc b/pylintrc index 658ba4d..5f0ae9c 100644 --- a/pylintrc +++ b/pylintrc @@ -283,10 +283,10 @@ max-attributes=7 max-bool-expr=5 # Maximum number of branch for function / method body. -max-branches=15 +max-branches=16 # Maximum number of locals for function / method body. -max-locals=15 +max-locals=18 # Maximum number of parents for a class (see R0901). 
max-parents=7 diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py index f518c8e..7d1561c 100644 --- a/src/edge_tts/communicate.py +++ b/src/edge_tts/communicate.py @@ -254,9 +254,15 @@ def __init__( self.voice: str = voice match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice) if match is not None: + lang = match.group(1) + region = match.group(2) + name = match.group(3) + if name.find("-") != -1: + region = region + "-" + name[: name.find("-")] + name = name[name.find("-") + 1 :] self.voice = ( "Microsoft Server Speech Text to Speech Voice" - + f" ({match.group(1)}-{match.group(2)}, {match.group(3)})" + + f" ({lang}-{region}, {name})" ) if ( @@ -291,24 +297,29 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: escape(remove_incompatible_characters(self.text)), calc_max_mesg_size(self.voice, self.rate, self.volume), ) - - async with aiohttp.ClientSession(trust_env=True) as session, session.ws_connect( - f"{WSS_URL}&ConnectionId={connect_id()}", - compress=15, - autoclose=True, - autoping=True, - proxy=self.proxy, - headers={ - "Pragma": "no-cache", - "Cache-Control": "no-cache", - "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "en-US,en;q=0.9", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", - }, - ) as websocket: - for text in texts: + final_utterance: Dict[int, int] = {} + prev_idx = -1 + shift_time = -1 + + for idx, text in enumerate(texts): + async with aiohttp.ClientSession( + trust_env=True + ) as session, session.ws_connect( + f"{WSS_URL}&ConnectionId={connect_id()}", + compress=15, + autoclose=True, + autoping=True, + proxy=self.proxy, + headers={ + "Pragma": "no-cache", + "Cache-Control": "no-cache", + "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": 
"en-US,en;q=0.9", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", + }, + ) as websocket: # download indicates whether we should be expecting audio data, # this is so what we avoid getting binary data from the websocket # and falsely thinking it's audio data. @@ -362,10 +373,25 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: elif path == b"audio.metadata": for meta_obj in json.loads(data)["Metadata"]: meta_type = meta_obj["Type"] + if idx != prev_idx: + shift_time = sum( + final_utterance[i] for i in range(idx) + ) + prev_idx = idx if meta_type == "WordBoundary": + final_utterance[idx] = ( + meta_obj["Data"]["Offset"] + + meta_obj["Data"]["Duration"] + # Average padding added by the service + # Alternatively we could use ffmpeg to get value properly + # but I don't want to add an additional dependency + # if this is found to work well enough. + + 8_750_000 + ) yield { "type": meta_type, - "offset": meta_obj["Data"]["Offset"], + "offset": meta_obj["Data"]["Offset"] + + shift_time, "duration": meta_obj["Data"]["Duration"], "text": meta_obj["Data"]["text"]["Text"], } diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py index 03a04db..959b373 100644 --- a/src/edge_tts/submaker.py +++ b/src/edge_tts/submaker.py @@ -40,7 +40,7 @@ class SubMaker: SubMaker class """ - def __init__(self, overlapping: int = 1) -> None: + def __init__(self, overlapping: int = 0) -> None: """ SubMaker constructor. 
diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py index e19fbba..c4d6f59 100644 --- a/src/edge_tts/util.py +++ b/src/edge_tts/util.py @@ -14,7 +14,9 @@ async def _print_voices(*, proxy: str) -> None: """Print all available voices.""" - for idx, voice in enumerate(await list_voices(proxy=proxy)): + voices = await list_voices(proxy=proxy) + voices = sorted(voices, key=lambda voice: voice["ShortName"]) # type: ignore + for idx, voice in enumerate(voices): if idx != 0: print() @@ -82,8 +84,8 @@ async def _async_main() -> None: parser.add_argument( "-O", "--overlapping", - help="overlapping subtitles in seconds", - default=1, + help="overlapping subtitles in seconds. Default: 0.", + default=0, type=float, ) parser.add_argument( diff --git a/src/edge_tts/version.py b/src/edge_tts/version.py index aa1c517..ac9ba97 100644 --- a/src/edge_tts/version.py +++ b/src/edge_tts/version.py @@ -1,4 +1,4 @@ """Edge TTS version information.""" -__version__ = "6.1.1" +__version__ = "6.1.3" __version_info__ = tuple(int(num) for num in __version__.split("."))