From 8578d24c51a514678ea4d7336be86008839673db Mon Sep 17 00:00:00 2001
From: Sameer Rahmani <lxsameer@gnu.org>
Date: Fri, 15 Apr 2022 14:35:30 +0100
Subject: [PATCH] Add the Rigel for TTS

---
 .gitignore             |   4 +-
 builder                |   8 +
 rigel/.models.json     | 377 +++++++++++++++++++++++++++++++++++++++++
 rigel/__init__.py      |   0
 rigel/requirements.txt |   2 +
 rigel/server.py        |  29 ++++
 rigel/tcp.py           |  75 ++++++++
 7 files changed, 494 insertions(+), 1 deletion(-)
 create mode 100644 rigel/.models.json
 create mode 100644 rigel/__init__.py
 create mode 100644 rigel/requirements.txt
 create mode 100755 rigel/server.py
 create mode 100644 rigel/tcp.py

diff --git a/.gitignore b/.gitignore
index c161c85..5e937ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,6 @@ nc.tar.xz
 *.ogg
 *.wav
 models
-demo
\ No newline at end of file
+demo
+__pycache__/
+*.pyc
\ No newline at end of file
diff --git a/builder b/builder
index dc0b0b7..a65b052 100755
--- a/builder
+++ b/builder
@@ -92,6 +92,7 @@ function setup() { ## Setup the working directory and make it ready for developm
     # shellcheck source=.venv/bin/activate
     . "$ME/.venv/bin/activate"
     info "Intalling tflite runtime..."
+    pip3 install -r rigel/requirements.txt
     pip3 install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime
 
     info "Downloding STT..."
@@ -120,6 +121,13 @@ function run() { ## Setup the working directory and make it ready for developmen
     deactivate
 }
 
+function rigel() { ## Run the rigel server
+    # shellcheck source=.venv/bin/activate
+    . "$ME/.venv/bin/activate"
+    "$ME/rigel/server.py" "$ME/.env"
+    deactivate
+}
+
 function help() { ## Print out this help message
     echo "Commands:"
     grep -E '^function [a-zA-Z0-9_-]+\(\) \{ ## .*$$' "$0" | \
diff --git a/rigel/.models.json b/rigel/.models.json
new file mode 100644
index 0000000..801b846
--- /dev/null
+++ b/rigel/.models.json
@@ -0,0 +1,377 @@
+{
+    "tts_models": {
+        "multilingual":{
+            "multi-dataset":{
+                "your_tts":{
+                    "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+                    "default_vocoder": null,
+                    "commit": "e9a1953e",
+                    "license": "CC BY-NC-ND 4.0",
+                    "contact": "egolge@coqui.ai"
+                }
+            }
+        },
+        "en": {
+            "ek1": {
+                "tacotron2": {
+                    "description": "EK1 en-rp tacotron2 by NMStoker",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip",
+                    "default_vocoder": "vocoder_models/en/ek1/wavegrad",
+                    "commit": "c802255"
+                }
+            },
+            "ljspeech": {
+                "tacotron2-DDC": {
+                    "description": "Tacotron2 with Double Decoder Consistency.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+                    "commit": "bae2ad0f",
+                    "author": "Eren Gölge @erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.com"
+                },
+                "tacotron2-DDC_ph": {
+                    "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
+                    "default_vocoder": "vocoder_models/en/ljspeech/univnet",
+                    "commit": "3900448",
+                    "author": "Eren Gölge @erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.com"
+                },
+                "glow-tts": {
+                    "description": "",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip",
+                    "stats_file": null,
+                    "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+                    "commit": "",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                },
+                "speedy-speech": {
+                    "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
+                    "stats_file": null,
+                    "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+                    "commit": "4581e3d",
+                    "author": "Eren Gölge @erogol",
+                    "license": "TBD",
+                    "contact": "egolge@coqui.com"
+                },
+                "tacotron2-DCA": {
+                    "description": "",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",
+                    "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+                    "commit": "",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                },
+                "vits": {
+                    "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
+                    "default_vocoder": null,
+                    "commit": "3900448",
+                    "author": "Eren Gölge @erogol",
+                    "license": "TBD",
+                    "contact": "egolge@coqui.com"
+                },
+                "fast_pitch": {
+                    "description": "FastPitch model trained on LJSpeech using the Aligner Network",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip",
+                    "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+                    "commit": "b27b3ba",
+                    "author": "Eren Gölge @erogol",
+                    "license": "TBD",
+                    "contact": "egolge@coqui.com"
+                }
+            },
+            "vctk": {
+                "vits": {
+                    "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
+                    "default_vocoder": null,
+                    "commit": "3900448",
+                    "author": "Eren @erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.ai"
+                },
+                "fast_pitch":{
+                    "description": "FastPitch model trained on VCTK dataseset.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
+                    "default_vocoder": null,
+                    "commit": "bdab788d",
+                    "author": "Eren @erogol",
+                    "license": "CC BY-NC-ND 4.0",
+                    "contact": "egolge@coqui.ai"
+                }
+            },
+            "sam": {
+                "tacotron-DDC": {
+                    "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip",
+                    "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
+                    "commit": "bae2ad0f",
+                    "author": "Eren Gölge @erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.com"
+                }
+            }
+        },
+        "es": {
+            "mai": {
+                "tacotron2-DDC": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+                    "commit": "",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                }
+            }
+        },
+        "fr": {
+            "mai": {
+                "tacotron2-DDC": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+                    "commit": "",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                }
+            }
+        },
+        "uk":{
+            "mai": {
+                "glow-tts": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
+                    "author":"@robinhad",
+                    "commit": "bdab788d",
+                    "license": "MIT",
+                    "contact": "",
+                    "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
+                }
+            }
+        },
+        "zh-CN": {
+            "baker": {
+                "tacotron2-DDC-GST": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+                    "commit": "unknown",
+                    "author": "@kirianguiller",
+                    "default_vocoder": null
+                }
+            }
+        },
+        "nl": {
+            "mai": {
+                "tacotron2-DDC": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip",
+                    "author": "@r-dh",
+                    "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
+                    "stats_file": null,
+                    "commit": "540d811"
+                }
+            }
+        },
+        "de": {
+            "thorsten": {
+                "tacotron2-DCA": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
+                    "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
+                    "author": "@thorstenMueller",
+                    "commit": "unknown"
+                }
+            }
+        },
+        "ja": {
+            "kokoro": {
+                "tacotron2-DDC": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
+                    "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
+                    "author": "@kaiidams",
+                    "commit": "401fbd89"
+                }
+            }
+        },
+        "tr":{
+            "common-voice": {
+                "glow-tts":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip",
+                    "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
+                    "license": "MIT",
+                    "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
+                    "author": "Fatih Akademi",
+                    "commit": null
+                }
+            }
+        },
+        "it": {
+            "mai_female": {
+                "glow-tts":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip",
+                    "default_vocoder": null,
+                    "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+                    "author": "@nicolalandro",
+                    "commit": null
+                },
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip",
+                    "default_vocoder": null,
+                    "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+                    "author": "@nicolalandro",
+                    "commit": null
+                }
+            },
+            "mai_male": {
+                "glow-tts":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip",
+                    "default_vocoder": null,
+                    "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+                    "author": "@nicolalandro",
+                    "commit": null
+                },
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip",
+                    "default_vocoder": null,
+                    "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+                    "author": "@nicolalandro",
+                    "commit": null
+                }
+            }
+        }
+    },
+    "vocoder_models": {
+        "universal": {
+            "libri-tts": {
+                "wavegrad": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip",
+                    "commit": "ea976b0",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                },
+                "fullband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip",
+                    "commit": "4132240",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                }
+            }
+        },
+        "en": {
+            "ek1": {
+                "wavegrad": {
+                    "description": "EK1 en-rp wavegrad by NMStoker",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip",
+                    "commit": "c802255"
+                }
+            },
+            "ljspeech": {
+                "multiband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip",
+                    "commit": "ea976b0",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "egolge@coqui.com"
+                },
+                "hifigan_v2": {
+                    "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip",
+                    "commit": "bae2ad0f",
+                    "author": "@erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.ai"
+                },
+                "univnet": {
+                    "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
+                    "commit": "4581e3d",
+                    "author": "Eren @erogol",
+                    "license": "TBD",
+                    "contact": "egolge@coqui.ai"
+                }
+            },
+            "vctk": {
+                "hifigan_v2": {
+                    "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip",
+                    "commit": "2f07160",
+                    "author": "Edresson Casanova",
+                    "license": "",
+                    "contact": ""
+                }
+            },
+            "sam": {
+                "hifigan_v2": {
+                    "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip",
+                    "commit": "2f07160",
+                    "author": "Eren Gölge @erogol",
+                    "license": "",
+                    "contact": "egolge@coqui.ai"
+                }
+            }
+        },
+        "nl": {
+            "mai": {
+                "parallel-wavegan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip",
+                    "author": "@r-dh",
+                    "commit": "unknown"
+                }
+            }
+        },
+        "de": {
+            "thorsten": {
+                "wavegrad": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
+                    "author": "@thorstenMueller",
+                    "commit": "unknown"
+                },
+                "fullband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
+                    "author": "@thorstenMueller",
+                    "commit": "unknown"
+                }
+            }
+        },
+        "ja": {
+            "kokoro": {
+                "hifigan_v1": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip",
+                    "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
+                    "author": "@kaiidams",
+                    "commit": "3900448"
+                }
+            }
+        },
+        "uk": {
+            "mai": {
+                "multiband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip",
+                    "author":"@robinhad",
+                    "commit": "bdab788d",
+                    "license": "MIT",
+                    "contact": ""
+                }
+            }
+        },
+        "tr":{
+            "common-voice": {
+                "hifigan":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip",
+                    "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
+                    "author": "Fatih Akademi",
+                    "license": "MIT",
+                    "commit": null
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/rigel/__init__.py b/rigel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rigel/requirements.txt b/rigel/requirements.txt
new file mode 100644
index 0000000..0a95692
--- /dev/null
+++ b/rigel/requirements.txt
@@ -0,0 +1,2 @@
+tts==0.6
+python-dotenv
diff --git a/rigel/server.py b/rigel/server.py
new file mode 100755
index 0000000..d3cfa01
--- /dev/null
+++ b/rigel/server.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+#
+# Orion Rigel --- Text to Speech engine
+#
+#   Copyright (c) 2022  Sameer Rahmani <lxsameer@gnu.org>
+#
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation, either version 2 of the License.
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#   You should have received a copy of the GNU General Public License
+
+import sys
+import asyncio
+
+import dotenv
+
+from tcp import start, synth
+
+
+def main():
+    config = dotenv.dotenv_values(sys.argv[1])
+    asyncio.run(start(config, synth(config)))
+
+if __name__ == "__main__":
+    main()
diff --git a/rigel/tcp.py b/rigel/tcp.py
new file mode 100644
index 0000000..90be2fa
--- /dev/null
+++ b/rigel/tcp.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+#
+# Orion Rigel --- Text to Speech engine
+#
+#   Copyright (c) 2022  Sameer Rahmani <lxsameer@gnu.org>
+#
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation, either version 2 of the License.
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#   You should have received a copy of the GNU General Public License
+
+import asyncio
+# pylint: disable=redefined-outer-name, unused-argument
+from pathlib import Path
+
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+def synth(config):
+    path = Path(__file__).parent / ".models.json"
+    manager = ModelManager(path)
+
+    model_name = config.get("MODEL_NAME", "tts_models/en/ljspeech/tacotron2-DDC")
+    language_ids_file_path = None
+    vocoder_path = None
+    vocoder_config_path = None
+    encoder_path = None
+    encoder_config_path = None
+
+    model_path, config_path, model_item = manager.download_model(model_name)
+    vocoder_name = model_item["default_vocoder"]
+    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
+    speaker_idx = config.get("SPEAKER_IDX")
+
+    # load models
+    synthesizer = Synthesizer(
+        model_path,
+        config_path,
+        None,
+        language_ids_file_path,
+        vocoder_path,
+        vocoder_config_path,
+        encoder_path,
+        encoder_config_path,
+        False,
+    )
+
+    async def tcp_handler(reader, writer):
+        data = await reader.readuntil(b'\0')
+        message = data.decode()
+        #wav = synthesizer.tts(message, speaker_idx, "None", None)
+        print(f"Received {message!r}")
+
+        writer.write("Ok")
+        await writer.drain()
+        writer.close()
+
+    return tcp_handler
+
+
+async def start(config, fn):
+    host = config.get('host', '127.0.0.1')
+    port = config.get('port', 6666)
+    server = await asyncio.start_server(fn, host, port)
+
+    addrs = ', '.join(str(sock.getsockname()) for sock in server.sockets)
+    print(f'Serving on {addrs}')
+
+    async with server:
+        await server.serve_forever()