From 8578d24c51a514678ea4d7336be86008839673db Mon Sep 17 00:00:00 2001 From: Sameer Rahmani Date: Fri, 15 Apr 2022 14:35:30 +0100 Subject: [PATCH] Add the Rigel for TTS --- .gitignore | 4 +- builder | 8 + rigel/.models.json | 377 +++++++++++++++++++++++++++++++++++++++++ rigel/__init__.py | 0 rigel/requirements.txt | 2 + rigel/server.py | 29 ++++ rigel/tcp.py | 75 ++++++++ 7 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 rigel/.models.json create mode 100644 rigel/__init__.py create mode 100644 rigel/requirements.txt create mode 100755 rigel/server.py create mode 100644 rigel/tcp.py diff --git a/.gitignore b/.gitignore index c161c85..5e937ae 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ nc.tar.xz *.ogg *.wav models -demo \ No newline at end of file +demo +__pycache__/ +*.pyc \ No newline at end of file diff --git a/builder b/builder index dc0b0b7..a65b052 100755 --- a/builder +++ b/builder @@ -92,6 +92,7 @@ function setup() { ## Setup the working directory and make it ready for developm # shellcheck source=.venv/bin/activate . "$ME/.venv/bin/activate" info "Intalling tflite runtime..." + pip3 install -r rigel/requirements.txt pip3 install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime info "Downloding STT..." @@ -120,6 +121,13 @@ function run() { ## Setup the working directory and make it ready for developmen deactivate } +function rigel() { ## Run the rigel server + # shellcheck source=.venv/bin/activate + . 
"$ME/.venv/bin/activate" + "$ME/rigel/server.py" "$ME/.env" + deactivate +} + function help() { ## Print out this help message echo "Commands:" grep -E '^function [a-zA-Z0-9_-]+\(\) \{ ## .*$$' "$0" | \ diff --git a/rigel/.models.json b/rigel/.models.json new file mode 100644 index 0000000..801b846 --- /dev/null +++ b/rigel/.models.json @@ -0,0 +1,377 @@ +{ + "tts_models": { + "multilingual":{ + "multi-dataset":{ + "your_tts":{ + "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "default_vocoder": null, + "commit": "e9a1953e", + "license": "CC BY-NC-ND 4.0", + "contact": "egolge@coqui.ai" + } + } + }, + "en": { + "ek1": { + "tacotron2": { + "description": "EK1 en-rp tacotron2 by NMStoker", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip", + "default_vocoder": "vocoder_models/en/ek1/wavegrad", + "commit": "c802255" + } + }, + "ljspeech": { + "tacotron2-DDC": { + "description": "Tacotron2 with Double Decoder Consistency.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", + "commit": "bae2ad0f", + "author": "Eren Gölge @erogol", + "license": "", + "contact": "egolge@coqui.com" + }, + "tacotron2-DDC_ph": { + "description": "Tacotron2 with Double Decoder Consistency with phonemes.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", + "default_vocoder": "vocoder_models/en/ljspeech/univnet", + "commit": "3900448", + "author": "Eren Gölge @erogol", + "license": "", + "contact": "egolge@coqui.com" + }, + "glow-tts": { + "description": "", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", + "stats_file": null, + "default_vocoder": 
"vocoder_models/en/ljspeech/multiband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + }, + "speedy-speech": { + "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip", + "stats_file": null, + "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", + "commit": "4581e3d", + "author": "Eren Gölge @erogol", + "license": "TBD", + "contact": "egolge@coqui.com" + }, + "tacotron2-DCA": { + "description": "", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + }, + "vits": { + "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip", + "default_vocoder": null, + "commit": "3900448", + "author": "Eren Gölge @erogol", + "license": "TBD", + "contact": "egolge@coqui.com" + }, + "fast_pitch": { + "description": "FastPitch model trained on LJSpeech using the Aligner Network", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip", + "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", + "commit": "b27b3ba", + "author": "Eren Gölge @erogol", + "license": "TBD", + "contact": "egolge@coqui.com" + } + }, + "vctk": { + "vits": { + "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip", + "default_vocoder": null, + "commit": "3900448", + "author": "Eren @erogol", + "license": "", + "contact": 
"egolge@coqui.ai" + }, + "fast_pitch":{ + "description": "FastPitch model trained on VCTK dataseset.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip", + "default_vocoder": null, + "commit": "bdab788d", + "author": "Eren @erogol", + "license": "CC BY-NC-ND 4.0", + "contact": "egolge@coqui.ai" + } + }, + "sam": { + "tacotron-DDC": { + "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", + "default_vocoder": "vocoder_models/en/sam/hifigan_v2", + "commit": "bae2ad0f", + "author": "Eren Gölge @erogol", + "license": "", + "contact": "egolge@coqui.com" + } + } + }, + "es": { + "mai": { + "tacotron2-DDC": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + } + } + }, + "fr": { + "mai": { + "tacotron2-DDC": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + } + } + }, + "uk":{ + "mai": { + "glow-tts": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip", + "author":"@robinhad", + "commit": "bdab788d", + "license": "MIT", + "contact": "", + "default_vocoder": "vocoder_models/uk/mai/multiband-melgan" + } + } + }, + "zh-CN": { + "baker": { + "tacotron2-DDC-GST": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", + "commit": "unknown", + "author": "@kirianguiller", + "default_vocoder": null + } + } + }, + "nl": { + 
"mai": { + "tacotron2-DDC": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", + "author": "@r-dh", + "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", + "stats_file": null, + "commit": "540d811" + } + } + }, + "de": { + "thorsten": { + "tacotron2-DCA": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", + "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", + "author": "@thorstenMueller", + "commit": "unknown" + } + } + }, + "ja": { + "kokoro": { + "tacotron2-DDC": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", + "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", + "author": "@kaiidams", + "commit": "401fbd89" + } + } + }, + "tr":{ + "common-voice": { + "glow-tts":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip", + "default_vocoder": "vocoder_models/tr/common-voice/hifigan", + "license": "MIT", + "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.", + "author": "Fatih Akademi", + "commit": null + } + } + }, + "it": { + "mai_female": { + "glow-tts":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip", + "default_vocoder": null, + "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", + "author": "@nicolalandro", + "commit": null + }, + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip", + "default_vocoder": null, + "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", + "author": "@nicolalandro", + "commit": null + } + }, + "mai_male": { + "glow-tts":{ + "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip", + "default_vocoder": null, + "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", + "author": "@nicolalandro", + "commit": null + }, + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip", + "default_vocoder": null, + "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", + "author": "@nicolalandro", + "commit": null + } + } + } + }, + "vocoder_models": { + "universal": { + "libri-tts": { + "wavegrad": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + }, + "fullband-melgan": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "commit": "4132240", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + } + } + }, + "en": { + "ek1": { + "wavegrad": { + "description": "EK1 en-rp wavegrad by NMStoker", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", + "commit": "c802255" + } + }, + "ljspeech": { + "multiband-melgan": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact": "egolge@coqui.com" + }, + "hifigan_v2": { + "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", + "commit": "bae2ad0f", + "author": "@erogol", + "license": "", + "contact": "egolge@coqui.ai" + }, + "univnet": { + "description": "UnivNet model finetuned on TacotronDDC_ph 
spectrograms for better compatibility.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip", + "commit": "4581e3d", + "author": "Eren @erogol", + "license": "TBD", + "contact": "egolge@coqui.ai" + } + }, + "vctk": { + "hifigan_v2": { + "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", + "commit": "2f07160", + "author": "Edresson Casanova", + "license": "", + "contact": "" + } + }, + "sam": { + "hifigan_v2": { + "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", + "commit": "2f07160", + "author": "Eren Gölge @erogol", + "license": "", + "contact": "egolge@coqui.ai" + } + } + }, + "nl": { + "mai": { + "parallel-wavegan": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", + "author": "@r-dh", + "commit": "unknown" + } + } + }, + "de": { + "thorsten": { + "wavegrad": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", + "author": "@thorstenMueller", + "commit": "unknown" + }, + "fullband-melgan": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", + "author": "@thorstenMueller", + "commit": "unknown" + } + } + }, + "ja": { + "kokoro": { + "hifigan_v1": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip", + "description": "HifiGAN model trained for kokoro dataset by @kaiidams", + "author": "@kaiidams", + "commit": "3900448" + } + } + }, + "uk": { + "mai": { + "multiband-melgan": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip", + "author":"@robinhad", + 
"commit": "bdab788d", + "license": "MIT", + "contact": "" + } + } + }, + "tr":{ + "common-voice": { + "hifigan":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip", + "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", + "author": "Fatih Akademi", + "license": "MIT", + "commit": null + } + } + } + } +} \ No newline at end of file diff --git a/rigel/__init__.py b/rigel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rigel/requirements.txt b/rigel/requirements.txt new file mode 100644 index 0000000..0a95692 --- /dev/null +++ b/rigel/requirements.txt @@ -0,0 +1,2 @@ +tts==0.6 +python-dotenv diff --git a/rigel/server.py b/rigel/server.py new file mode 100755 index 0000000..d3cfa01 --- /dev/null +++ b/rigel/server.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# +# Orion Rigel --- Text to Speech engine +# +# Copyright (c) 2022 Sameer Rahmani +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# You should have received a copy of the GNU General Public License
+
+import sys
+import asyncio
+
+import dotenv
+
+from tcp import start, synth
+
+
+def main():
+    config = dotenv.dotenv_values(sys.argv[1])
+    asyncio.run(start(config, synth(config)))
+
+if __name__ == "__main__":
+    main()
diff --git a/rigel/tcp.py b/rigel/tcp.py
new file mode 100644
index 0000000..90be2fa
--- /dev/null
+++ b/rigel/tcp.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+#
+# Orion Rigel --- Text to Speech engine
+#
+# Copyright (c) 2022 Sameer Rahmani
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+
+import asyncio
+# pylint: disable=redefined-outer-name, unused-argument
+from pathlib import Path
+
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+def synth(config):
+    """Load the TTS models and build the per-connection TCP handler.
+
+    `config` is a mapping (server.py passes the result of
+    `dotenv.dotenv_values`).  `MODEL_NAME` selects the model to download
+    from the bundled `.models.json` registry and `SPEAKER_IDX` is read for
+    the (not yet enabled) synthesis call.
+
+    Returns a coroutine function `(reader, writer)` suitable for
+    `asyncio.start_server`.
+    """
+    path = Path(__file__).parent / ".models.json"
+    manager = ModelManager(path)
+
+    model_name = config.get("MODEL_NAME", "tts_models/en/ljspeech/tacotron2-DDC")
+    language_ids_file_path = None
+    vocoder_path = None
+    vocoder_config_path = None
+    encoder_path = None
+    encoder_config_path = None
+
+    model_path, config_path, model_item = manager.download_model(model_name)
+    # Several registry entries have `"default_vocoder": null` (e.g. the
+    # vits models); only fetch a vocoder when the model names one.
+    vocoder_name = model_item.get("default_vocoder")
+    if vocoder_name is not None:
+        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
+    speaker_idx = config.get("SPEAKER_IDX")
+
+    # load models
+    # NOTE(review): positional order presumably follows TTS 0.6's
+    # Synthesizer signature (None = speakers file, False = use_cuda) -- confirm.
+    synthesizer = Synthesizer(
+        model_path,
+        config_path,
+        None,
+        language_ids_file_path,
+        vocoder_path,
+        vocoder_config_path,
+        encoder_path,
+        encoder_config_path,
+        False,
+    )
+
+    async def tcp_handler(reader, writer):
+        """Read one NUL-terminated message, acknowledge it and hang up."""
+        try:
+            data = await reader.readuntil(b'\0')
+            # readuntil() returns the separator as well; drop it.
+            message = data[:-1].decode()
+            # TODO: wav = synthesizer.tts(message, speaker_idx, "None", None)
+            print(f"Received {message!r}")
+
+            # StreamWriter.write() only accepts bytes-like objects.
+            writer.write(b"Ok")
+            await writer.drain()
+        except (asyncio.IncompleteReadError, asyncio.LimitOverrunError) as err:
+            # The peer closed early or overran the stream buffer limit.
+            print(f"Dropping connection: {err!r}")
+        finally:
+            writer.close()
+            await writer.wait_closed()
+
+    return tcp_handler
+
+
+async def start(config, fn):
+    """Serve `fn` over TCP until cancelled.
+
+    The bind address comes from the `host` and `port` keys of `config`,
+    falling back to 127.0.0.1:6666.
+    """
+    host = config.get('host', '127.0.0.1')
+    port = config.get('port', 6666)
+    server = await asyncio.start_server(fn, host, port)
+
+    addrs = ', '.join(str(sock.getsockname()) for sock in server.sockets)
+    print(f'Serving on {addrs}')
+
+    async with server:
+        await server.serve_forever()