Update edge_tts to v6.1.3

* Sort voices in util's --list-voices output
* Disable overlapping subtitles by default
* Try to fix subtitle timing for many-hour-long TTS generation
* Match Microsoft Edge's connection behavior more closely
* Bump edge_tts version
kill136 · Jan 25, 2023 · 58307de · 58307de
1 parent 85eef7d
commit 58307de
Showing 6 changed files with 57 additions and 27 deletions.
diff --git a/lint.sh b/lint.sh
@@ -1,2 +1,4 @@
+#!/bin/sh
+set -ux
 find src examples -name '*.py' | xargs pylint
 find src examples -name '*.py' | xargs mypy
diff --git a/pylintrc b/pylintrc
@@ -283,10 +283,10 @@ max-attributes=7
 max-bool-expr=5
 
 # Maximum number of branch for function / method body.
-max-branches=15
+max-branches=16
 
 # Maximum number of locals for function / method body.
-max-locals=15
+max-locals=18
 
 # Maximum number of parents for a class (see R0901).
 max-parents=7

diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
@@ -254,9 +254,15 @@ def __init__(
         self.voice: str = voice
         match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice)
         if match is not None:
+            lang = match.group(1)
+            region = match.group(2)
+            name = match.group(3)
+            if name.find("-") != -1:
+                region = region + "-" + name[: name.find("-")]
+                name = name[name.find("-") + 1 :]
             self.voice = (
                 "Microsoft Server Speech Text to Speech Voice"
-                + f" ({match.group(1)}-{match.group(2)}, {match.group(3)})"
+                + f" ({lang}-{region}, {name})"
             )
 
         if (
@@ -291,24 +297,29 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
             escape(remove_incompatible_characters(self.text)),
             calc_max_mesg_size(self.voice, self.rate, self.volume),
         )
-
-        async with aiohttp.ClientSession(trust_env=True) as session, session.ws_connect(
-            f"{WSS_URL}&ConnectionId={connect_id()}",
-            compress=15,
-            autoclose=True,
-            autoping=True,
-            proxy=self.proxy,
-            headers={
-                "Pragma": "no-cache",
-                "Cache-Control": "no-cache",
-                "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-                "Accept-Encoding": "gzip, deflate, br",
-                "Accept-Language": "en-US,en;q=0.9",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
-            },
-        ) as websocket:
-            for text in texts:
+        final_utterance: Dict[int, int] = {}
+        prev_idx = -1
+        shift_time = -1
+
+        for idx, text in enumerate(texts):
+            async with aiohttp.ClientSession(
+                trust_env=True
+            ) as session, session.ws_connect(
+                f"{WSS_URL}&ConnectionId={connect_id()}",
+                compress=15,
+                autoclose=True,
+                autoping=True,
+                proxy=self.proxy,
+                headers={
+                    "Pragma": "no-cache",
+                    "Cache-Control": "no-cache",
+                    "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+                    "Accept-Encoding": "gzip, deflate, br",
+                    "Accept-Language": "en-US,en;q=0.9",
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                    " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
+                },
+            ) as websocket:
                 # download indicates whether we should be expecting audio data,
                 # this is so what we avoid getting binary data from the websocket
                 # and falsely thinking it's audio data.
@@ -362,10 +373,25 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
                         elif path == b"audio.metadata":
                             for meta_obj in json.loads(data)["Metadata"]:
                                 meta_type = meta_obj["Type"]
+                                if idx != prev_idx:
+                                    shift_time = sum(
+                                        final_utterance[i] for i in range(idx)
+                                    )
+                                    prev_idx = idx
                                 if meta_type == "WordBoundary":
+                                    final_utterance[idx] = (
+                                        meta_obj["Data"]["Offset"]
+                                        + meta_obj["Data"]["Duration"]
+                                        # Average padding added by the service
+                                        # Alternatively we could use ffmpeg to get value properly
+                                        # but I don't want to add an additional dependency
+                                        # if this is found to work well enough.
+                                        + 8_750_000
+                                    )
                                     yield {
                                         "type": meta_type,
-                                        "offset": meta_obj["Data"]["Offset"],
+                                        "offset": meta_obj["Data"]["Offset"]
+                                        + shift_time,
                                         "duration": meta_obj["Data"]["Duration"],
                                         "text": meta_obj["Data"]["text"]["Text"],
                                     }

diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
@@ -40,7 +40,7 @@ class SubMaker:
     SubMaker class
     """
 
-    def __init__(self, overlapping: int = 1) -> None:
+    def __init__(self, overlapping: int = 0) -> None:
         """
         SubMaker constructor.
 

diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py
@@ -14,7 +14,9 @@
 
 async def _print_voices(*, proxy: str) -> None:
     """Print all available voices."""
-    for idx, voice in enumerate(await list_voices(proxy=proxy)):
+    voices = await list_voices(proxy=proxy)
+    voices = sorted(voices, key=lambda voice: voice["ShortName"])  # type: ignore
+    for idx, voice in enumerate(voices):
         if idx != 0:
             print()
 
@@ -82,8 +84,8 @@ async def _async_main() -> None:
     parser.add_argument(
         "-O",
         "--overlapping",
-        help="overlapping subtitles in seconds",
-        default=1,
+        help="overlapping subtitles in seconds. Default: 0.",
+        default=0,
         type=float,
     )
     parser.add_argument(

diff --git a/src/edge_tts/version.py b/src/edge_tts/version.py
@@ -1,4 +1,4 @@
 """Edge TTS version information."""
 
-__version__ = "6.1.1"
+__version__ = "6.1.3"
 __version_info__ = tuple(int(num) for num in __version__.split("."))