From c7b918ddae67d78bcfd6b4e887d496bc2778cb71 Mon Sep 17 00:00:00 2001 From: Sameer Rahmani Date: Sat, 9 Apr 2022 18:14:30 +0100 Subject: [PATCH] Finish up the basic bot behavior and the general speech recognition --- .gitignore | 3 +- builder | 10 +- main.go | 84 ---------------- orion.go | 59 +++++++++--- pkg/core/converter.go | 2 +- pkg/core/core.go | 216 ++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 260 insertions(+), 114 deletions(-) delete mode 100644 main.go diff --git a/.gitignore b/.gitignore index a8e97d7..29e802b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ coqui/ .env nc.tar.xz *.ogg -*.wav \ No newline at end of file +*.wav +models \ No newline at end of file diff --git a/builder b/builder index ffc3dbe..dc0b0b7 100755 --- a/builder +++ b/builder @@ -52,11 +52,6 @@ export CGO_LDFLAGS=-L$ME/coqui/ export CGO_CXXFLAGS=-I$ME/coqui/ export LD_LIBRARY_PATH=$ME/coqui/:$LD_LIBRARY_PATH -# The `builder` script is supposed to be run from the -# root of the source tree -ROOT_DIR=$ME -BUILD_DIR=$ROOT_DIR/build - default_model="$ME/models/default.tflite" default_scorer="$ME/models/default.scorer" @@ -105,13 +100,14 @@ function setup() { ## Setup the working directory and make it ready for developm tar -Jxvf nc.tar.xz -C "$ME/coqui" info "Installing coqui go binding..." - go get -u github.com/asticode/go-asticoqui/... go mod download info "Downloading the model..." - wget "$MODEL" -O "$default_model$ME/models/default.tflite" + mkdir -p "$(dirname $default_model)" + wget "$MODEL" -O "$default_model" info "Downloading the scorer..." + mkdir -p "$(dirname $default_scorer)" wget "$SCORER" -O "$default_scorer" deactivate } diff --git a/main.go b/main.go deleted file mode 100644 index 13a6a8a..0000000 --- a/main.go +++ /dev/null @@ -1,84 +0,0 @@ -package main - -import ( - "fmt" - "log" - "os" - "time" - "github.com/joho/godotenv" - tele "gopkg.in/telebot.v3" -) - -const user = "lxsameer" -const version = "0.1.0" - -func main1() { - err := godotenv.Load(".env") - if err != nil { - log.Fatal(err) - return - } - - pref := tele.Settings{ - Token: os.Getenv("BOT_TOKEN"), - Poller: &tele.LongPoller{Timeout: 10 * time.Second}, - } - - b, err := tele.NewBot(pref) - if err != nil { - log.Fatal(err) - return - } - - b.Handle(tele.OnText, func(c tele.Context) error { - me := c.Sender().Username - if (me != user) { - return c.Send("Get lost!") - } - return c.Send("Get lost!") - }) - - b.Handle("/version", func(c tele.Context) error { - me := c.Sender().Username - if (me != user) { - return c.Send("Get lost!") - } - - return c.Send(version) - }) - - b.Handle(tele.OnVoice, func(c tele.Context) error { - me := c.Sender().Username - if (me != user) { - return c.Send("Get lost!") - } - - fmt.Println("Got a voice!") - audio := c.Message().Voice.File - time := c.Message().Time().Format("2006-01-02_15:04:05") - c.Bot().Download(&audio, time + ".ogg") - fmt.Printf("Got ahthnthnt voice!\n") - - // _, err = b.FileReader.Read(content) - - // if (err != nil) { - // fmt.Printf("Error: %s\n", err) - // return c.Send("Error!") - // } - - // f, err := os.Create("/tmp/f.mp3") - - // if (err != nil) { - // fmt.Printf("Error: %s\n", err) - // return c.Send("Error!") - // } - // defer f.Close() - - // f.Write(content) - - fmt.Println("user:", me) - return c.Send("Got it!") - }) - - b.Start() -} diff --git a/orion.go b/orion.go index c441d97..1598c3b 100644 --- a/orion.go +++ b/orion.go @@ -21,38 +21,73 @@ import ( "flag" "log" "fmt" + "path/filepath" "lxsameer.com/go/orion/pkg/core" + "github.com/asticode/go-asticoqui" + "github.com/joho/godotenv" + ) -var model = flag.String("model", "", "Path to the model (protocol buffer binary file)") -var scorer = flag.String("scorer", "", "Path to the external scorer") +var model = flag.String("model", "models/default.tflite", "Path to the model (protocol buffer binary file)") +var scorer = flag.String("scorer", "models/default.scorer", "Path to the external scorer") var owner = flag.String("owner", "", "Telegram user id that is allowed to use this bot") +var storage = flag.String("voice-storage", "", "Where to store the voices") func main() { flag.Parse() log.SetFlags(0) - if *model == "" { + err := godotenv.Load(".env") + if err != nil { + log.Fatal(err) + return + } + + if *model == "" || *scorer == "" { // In case of error print error and print usage // This can also be done by passing -h or --help flags fmt.Fprintf(flag.CommandLine.Output(), "Usage of %s:\n", os.Args[0]) flag.PrintDefaults() return } - // // Initialize Coqui - // m, err := asticoqui.New(*model) - // if err != nil { - // log.Fatal("Failed initializing model: ", err) - // } - // defer m.Close() - a1 := "/home/lxsameer/src/orion/2022-04-09_11:48:57.ogg" - a2 := "/home/lxsameer/src/orion/blah.wav" - err := core.ConvertOggtoWav(&a1, &a2) + if *storage == "" { + dir, err := os.UserHomeDir() + if err != nil { + log.Fatal(err) + return + } + *storage = filepath.Join(dir, ".orion", "storage") + } + + bot, err := core.CreateBot(storage) if err != nil { log.Fatal(err) return } + + bot.Owner = *owner + + // Initialize Coqui + m, err := asticoqui.New(*model) + if err != nil { + log.Fatal("Failed initializing model: ", err) + } + defer m.Close() + + if err := m.EnableExternalScorer(*scorer); err != nil { + log.Fatal("Failed enabling external scorer: ", err) + return + } + + bot.Model = m + + bot.StartBot() + + // a1 := "/home/lxsameer/src/orion/2022-04-09_11:48:57.ogg" + // a2 := "/home/lxsameer/src/orion/blah.wav" + // err = core.ConvertOggtoWav(&a1, &a2) + log.Println("done!") } diff --git a/pkg/core/converter.go b/pkg/core/converter.go index 78eb48c..8d7f075 100644 --- a/pkg/core/converter.go +++ b/pkg/core/converter.go @@ -21,7 +21,7 @@ import ( ) func ConvertOggtoWav(inputFile *string, outputFile *string) error { - cmd := exec.Command("ffmpeg", "-i", *inputFile, *outputFile) + cmd := exec.Command("ffmpeg", "-i", *inputFile, "-ar", "16000", *outputFile) if err := cmd.Run(); err != nil { return err } diff --git a/pkg/core/core.go b/pkg/core/core.go index 231117a..2335793 100644 --- a/pkg/core/core.go +++ b/pkg/core/core.go @@ -1,12 +1,210 @@ +/* + Orion --- Speech to text bot + + Copyright (c) 2022 Sameer Rahmani + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ package core -// import ( -// "flag" -// "fmt" -// "io" -// "log" -// "os" +import ( + "fmt" + "io" + "log" + "os" + "path/filepath" + "strings" + "time" -// "github.com/asticode/go-asticoqui" -// "github.com/cryptix/wav" -// ) + "github.com/asticode/go-asticoqui" + "github.com/cryptix/wav" + tele "gopkg.in/telebot.v3" +) + +type Bot struct { + Owner string + StoragePath string + VoiceStorage string + Token string + Model *asticoqui.Model + ExtendedMetaData bool + MaxResults uint +} + +func CreateBot(storage *string) (*Bot,error) { + voiceStorage := filepath.Join(*storage, "voices") + err := os.MkdirAll(voiceStorage, 0750) + + if err != nil { + return nil, err + } + + return &Bot{ + Token: os.Getenv("BOT_TOKEN"), + MaxResults: 5, + ExtendedMetaData: false, + StoragePath: *storage, + VoiceStorage: voiceStorage, + }, nil +} + +func (bot *Bot) StartBot() { + pref := tele.Settings{ + Token: bot.Token, + // TODO: Move this to the config + Poller: &tele.LongPoller{Timeout: 10 * time.Second}, + } + + b, err := tele.NewBot(pref) + if err != nil { + log.Fatal(err) + return + } + + // b.Handle(tele.OnText, func(c tele.Context) error { + // if !bot.isOwner(c) { + // return nil + // } + // }) + + b.Handle(tele.OnVoice, func(c tele.Context) error { + if !bot.isOwner(c) { + return nil + } + + log.Println("Got a voice!") + filepath, err := bot.StoreVoice(c) + if err != nil { + return err + } + text, err := bot.ConvertToText(filepath) + + if err != nil { + return err + } + + s := strings.Join(*text, "**") + + c.Send(s) + fmt.Println(s) + + return c.Send("Got it!") + }) + + b.Start() +} + +func (bot *Bot) isOwner(c tele.Context) bool { + me := c.Sender().Username + log.Printf("[Info] User '%s' is trying to connect!", me) + + if me != bot.Owner { + c.Send("Get lost!") + return false + } + return true +} + +func (bot *Bot) StoreVoice(c tele.Context) (*string, error) { + v := c.Message().Voice.File + time := c.Message().Time().Format("2006-01-02_15:04:05") + + path := filepath.Join(bot.VoiceStorage, time + ".ogg") + wavePath := filepath.Join(bot.VoiceStorage, time + ".wav") + + if err := c.Bot().Download(&v, path); err != nil { + return nil, err + } + + if err := ConvertOggtoWav(&path, &wavePath); err != nil { + return nil, err + } + + if err := os.Remove(path); err != nil { + return nil, err + } + + return &wavePath, nil +} + + +func (bot *Bot) ConvertToText(voice *string) (*[]string, error) { + // Stat audio + i, err := os.Stat(*voice) + + if err != nil { + return nil, fmt.Errorf("stating %s failed: %w", *voice, err) + } + + // Open audio + f, err := os.Open(*voice) + if err != nil { + return nil, fmt.Errorf("opening %s failed: %w", *voice, err) + } + + // Create reader + r, err := wav.NewReader(f, i.Size()) + if err != nil { + return nil, fmt.Errorf("creating new reader failed: %w", err) + } + + // Read + var d []int16 + for { + // Read sample + s, err := r.ReadSample() + if err == io.EOF { + break + } else if err != nil { + return nil, fmt.Errorf("reading sample failed: %w", err) + } + + // Append + d = append(d, int16(s)) + } + + // Speech to text + var results []string + + if bot.ExtendedMetaData { + metadata, err := bot.Model.SpeechToTextWithMetadata(d, bot.MaxResults) + if err != nil { + return nil, fmt.Errorf("failed converting speech to text: ", err) + } + defer metadata.Close() + results = metadataToStrings(metadata) + } else { + res, err := bot.Model.SpeechToText(d) + if err != nil { + return nil, fmt.Errorf("failed converting speech to text: ", err) + } + results = []string{res} + } + + // for _, res := range results { + // fmt.Println("Text:", res) + // } + return &results, nil +} + + +func metadataToStrings(m *asticoqui.Metadata) []string { + results := make([]string, 0, m.NumTranscripts()) + for _, tr := range m.Transcripts() { + var res string + for _, tok := range tr.Tokens() { + res += tok.Text() + } + res += fmt.Sprintf(" [%0.3f]", tr.Confidence()) + results = append(results, res) + } + return results +}