Finish up the basic bot behavior and the general speech recognition

This commit is contained in:
Sameer Rahmani 2022-04-09 18:14:30 +01:00
parent b1f2020eaf
commit c7b918ddae
6 changed files with 260 additions and 114 deletions

3
.gitignore vendored
View File

@ -5,4 +5,5 @@ coqui/
.env
nc.tar.xz
*.ogg
*.wav
*.wav
models

10
builder
View File

@ -52,11 +52,6 @@ export CGO_LDFLAGS=-L$ME/coqui/
export CGO_CXXFLAGS=-I$ME/coqui/
export LD_LIBRARY_PATH=$ME/coqui/:$LD_LIBRARY_PATH
# The `builder` script is supposed to be run from the
# root of the source tree
ROOT_DIR=$ME
BUILD_DIR=$ROOT_DIR/build
default_model="$ME/models/default.tflite"
default_scorer="$ME/models/default.scorer"
@ -105,13 +100,14 @@ function setup() { ## Setup the working directory and make it ready for developm
tar -Jxvf nc.tar.xz -C "$ME/coqui"
info "Installing coqui go binding..."
go get -u github.com/asticode/go-asticoqui/...
go mod download
info "Downloading the model..."
wget "$MODEL" -O "$default_model$ME/models/default.tflite"
mkdir -p "$(dirname $default_model)"
wget "$MODEL" -O "$default_model"
info "Downloading the scorer..."
mkdir -p "$(dirname $default_scorer)"
wget "$SCORER" -O "$default_scorer"
deactivate
}

84
main.go
View File

@ -1,84 +0,0 @@
package main
import (
"fmt"
"log"
"os"
"time"
"github.com/joho/godotenv"
tele "gopkg.in/telebot.v3"
)
const user = "lxsameer"
const version = "0.1.0"
// main1 is the single-file bot entry point. It loads ".env", creates a
// long-polling Telegram bot from the BOT_TOKEN environment variable, registers
// the text, "/version" and voice handlers and then blocks in b.Start().
//
// Every handler compares the sender's username with the `user` constant and
// answers "Get lost!" to anybody else.
func main1() {
	// log.Fatal terminates the process, so no return is needed after it.
	if err := godotenv.Load(".env"); err != nil {
		log.Fatal(err)
	}

	pref := tele.Settings{
		Token:  os.Getenv("BOT_TOKEN"),
		Poller: &tele.LongPoller{Timeout: 10 * time.Second},
	}

	b, err := tele.NewBot(pref)
	if err != nil {
		log.Fatal(err)
	}

	b.Handle(tele.OnText, func(c tele.Context) error {
		me := c.Sender().Username
		if me != user {
			return c.Send("Get lost!")
		}
		// NOTE(review): the owner currently receives the same reply as
		// everybody else; confirm whether that is intended.
		return c.Send("Get lost!")
	})

	b.Handle("/version", func(c tele.Context) error {
		me := c.Sender().Username
		if me != user {
			return c.Send("Get lost!")
		}
		return c.Send(version)
	})

	b.Handle(tele.OnVoice, func(c tele.Context) error {
		me := c.Sender().Username
		if me != user {
			return c.Send("Get lost!")
		}

		fmt.Println("Got a voice!")
		audio := c.Message().Voice.File
		// Named `stamp` so the `time` package is not shadowed.
		stamp := c.Message().Time().Format("2006-01-02_15:04:05")

		// The voice is stored next to the working directory as <stamp>.ogg.
		// A failed download must not be reported to the user as success.
		if err := c.Bot().Download(&audio, stamp+".ogg"); err != nil {
			return fmt.Errorf("downloading voice message: %w", err)
		}
		fmt.Printf("Got ahthnthnt voice!\n")

		fmt.Println("user:", me)
		return c.Send("Got it!")
	})

	b.Start()
}

View File

@ -21,38 +21,73 @@ import (
"flag"
"log"
"fmt"
"path/filepath"
"lxsameer.com/go/orion/pkg/core"
"github.com/asticode/go-asticoqui"
"github.com/joho/godotenv"
)
// Command line flags. Each name is declared exactly once; model and scorer
// default to the files under models/.
var (
	model  = flag.String("model", "models/default.tflite", "Path to the model (protocol buffer binary file)")
	scorer = flag.String("scorer", "models/default.scorer", "Path to the external scorer")
	// The value is assigned to Bot.Owner, which is compared against the
	// sender's username, hence "username" rather than "user id".
	owner   = flag.String("owner", "", "Telegram username that is allowed to use this bot")
	storage = flag.String("voice-storage", "", "Where to store the voices")
)
func main() {
flag.Parse()
log.SetFlags(0)
if *model == "" {
err := godotenv.Load(".env")
if err != nil {
log.Fatal(err)
return
}
if *model == "" || *scorer == "" {
// In case of error print error and print usage
// This can also be done by passing -h or --help flags
fmt.Fprintf(flag.CommandLine.Output(), "Usage of %s:\n", os.Args[0])
flag.PrintDefaults()
return
}
// // Initialize Coqui
// m, err := asticoqui.New(*model)
// if err != nil {
// log.Fatal("Failed initializing model: ", err)
// }
// defer m.Close()
a1 := "/home/lxsameer/src/orion/2022-04-09_11:48:57.ogg"
a2 := "/home/lxsameer/src/orion/blah.wav"
err := core.ConvertOggtoWav(&a1, &a2)
if *storage == "" {
dir, err := os.UserHomeDir()
if err != nil {
log.Fatal(err)
return
}
*storage = filepath.Join(dir, ".orion", "storage")
}
bot, err := core.CreateBot(storage)
if err != nil {
log.Fatal(err)
return
}
bot.Owner = *owner
// Initialize Coqui
m, err := asticoqui.New(*model)
if err != nil {
log.Fatal("Failed initializing model: ", err)
}
defer m.Close()
if err := m.EnableExternalScorer(*scorer); err != nil {
log.Fatal("Failed enabling external scorer: ", err)
return
}
bot.Model = m
bot.StartBot()
// a1 := "/home/lxsameer/src/orion/2022-04-09_11:48:57.ogg"
// a2 := "/home/lxsameer/src/orion/blah.wav"
// err = core.ConvertOggtoWav(&a1, &a2)
log.Println("done!")
}

View File

@ -21,7 +21,7 @@ import (
)
func ConvertOggtoWav(inputFile *string, outputFile *string) error {
cmd := exec.Command("ffmpeg", "-i", *inputFile, *outputFile)
cmd := exec.Command("ffmpeg", "-i", *inputFile, "-ar", "16000", *outputFile)
if err := cmd.Run(); err != nil {
return err
}

View File

@ -1,12 +1,210 @@
/*
Orion --- Speech to text bot
Copyright (c) 2022 Sameer Rahmani <lxsameer@gnu.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package core
// import (
// "flag"
// "fmt"
// "io"
// "log"
// "os"
import (
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"time"
// "github.com/asticode/go-asticoqui"
// "github.com/cryptix/wav"
// )
"github.com/asticode/go-asticoqui"
"github.com/cryptix/wav"
tele "gopkg.in/telebot.v3"
)
// Bot bundles the settings and the speech model used by the Telegram bot.
type Bot struct {
// Owner is compared against the sender's Telegram username in isOwner;
// senders that do not match are answered with "Get lost!".
Owner string
// StoragePath is the storage directory that was handed to CreateBot.
StoragePath string
// VoiceStorage is the "voices" directory below StoragePath. StoreVoice
// writes the downloaded and the converted audio files there.
VoiceStorage string
// Token is the Telegram bot token. CreateBot reads it from the BOT_TOKEN
// environment variable.
Token string
// Model is the Coqui model used by ConvertToText. CreateBot does not set
// it, so the caller has to assign it before voices are transcribed.
Model *asticoqui.Model
// ExtendedMetaData makes ConvertToText call SpeechToTextWithMetadata
// instead of SpeechToText.
ExtendedMetaData bool
// MaxResults is passed to SpeechToTextWithMetadata as its second argument.
MaxResults uint
}
// CreateBot creates the "voices" directory below *storage (including missing
// parents, mode 0750) and returns a Bot that uses it.
//
// The token is read from the BOT_TOKEN environment variable, MaxResults is set
// to 5 and ExtendedMetaData to false. Owner and Model are left at their zero
// values.
//
// It returns an error when storage is nil or the directory cannot be created.
func CreateBot(storage *string) (*Bot, error) {
	if storage == nil {
		return nil, fmt.Errorf("creating bot: storage path is nil")
	}

	voiceStorage := filepath.Join(*storage, "voices")
	if err := os.MkdirAll(voiceStorage, 0750); err != nil {
		return nil, fmt.Errorf("creating voice storage %q: %w", voiceStorage, err)
	}

	return &Bot{
		Token:            os.Getenv("BOT_TOKEN"),
		MaxResults:       5,
		ExtendedMetaData: false,
		StoragePath:      *storage,
		VoiceStorage:     voiceStorage,
	}, nil
}
// StartBot creates a long-polling Telegram bot from bot.Token, registers the
// voice handler and blocks in the bot's Start loop. The process is terminated
// through log.Fatal when the Telegram bot cannot be created.
//
// The voice handler ignores senders rejected by isOwner. For the owner it
// stores the voice, transcribes it, sends the transcripts joined with "**"
// and then answers "Got it!".
func (bot *Bot) StartBot() {
	pref := tele.Settings{
		Token: bot.Token,
		// TODO: Move this to the config
		Poller: &tele.LongPoller{Timeout: 10 * time.Second},
	}

	b, err := tele.NewBot(pref)
	if err != nil {
		// log.Fatal exits the process, so no return is needed after it.
		log.Fatal(err)
	}

	// b.Handle(tele.OnText, func(c tele.Context) error {
	// 	if !bot.isOwner(c) {
	// 		return nil
	// 	}
	// })

	b.Handle(tele.OnVoice, func(c tele.Context) error {
		if !bot.isOwner(c) {
			return nil
		}

		log.Println("Got a voice!")

		// Named `wavPath` so the `path/filepath` package is not shadowed.
		wavPath, err := bot.StoreVoice(c)
		if err != nil {
			return err
		}

		text, err := bot.ConvertToText(wavPath)
		if err != nil {
			return err
		}

		s := strings.Join(*text, "**")
		// Sending the transcript stays best effort so that the final
		// acknowledgement is still attempted, but a failure is no longer
		// silently dropped.
		if err := c.Send(s); err != nil {
			log.Printf("[Error] Sending the transcript failed: %v", err)
		}
		fmt.Println(s)

		return c.Send("Got it!")
	})

	b.Start()
}
// isOwner reports whether the sender of the update is the configured owner,
// i.e. whether the sender's username equals bot.Owner.
//
// Everybody else is answered with "Get lost!" and rejected. Updates without a
// sender are rejected, and so is every sender while bot.Owner is empty:
// otherwise an unset owner would match any account that has no username.
func (bot *Bot) isOwner(c tele.Context) bool {
	sender := c.Sender()
	if sender == nil {
		log.Println("[Info] Rejecting an update without a sender")
		return false
	}

	me := sender.Username
	log.Printf("[Info] User '%s' is trying to connect!", me)

	if bot.Owner == "" || me != bot.Owner {
		// The rejection itself does not depend on the reply being delivered.
		if err := c.Send("Get lost!"); err != nil {
			log.Printf("[Error] Sending the rejection failed: %v", err)
		}
		return false
	}

	return true
}
// StoreVoice downloads the voice of the current message into bot.VoiceStorage
// as "<message time>.ogg", converts it with ConvertOggtoWav into a ".wav" file
// of the same base name and removes the ".ogg" file afterwards.
//
// It returns a pointer to the path of the ".wav" file, or the first error
// raised by the download, the conversion or the removal.
func (bot *Bot) StoreVoice(c tele.Context) (*string, error) {
	voiceFile := c.Message().Voice.File
	stamp := c.Message().Time().Format("2006-01-02_15:04:05")

	oggPath := filepath.Join(bot.VoiceStorage, stamp+".ogg")
	wavPath := filepath.Join(bot.VoiceStorage, stamp+".wav")

	// Each step only runs when all previous steps succeeded.
	err := c.Bot().Download(&voiceFile, oggPath)
	if err == nil {
		err = ConvertOggtoWav(&oggPath, &wavPath)
	}
	if err == nil {
		err = os.Remove(oggPath)
	}
	if err != nil {
		return nil, err
	}

	return &wavPath, nil
}
// ConvertToText reads the WAV file at *voice and runs it through bot.Model.
//
// With bot.ExtendedMetaData set, up to bot.MaxResults candidate transcripts
// are requested and formatted by metadataToStrings; otherwise the result holds
// the single transcript returned by SpeechToText.
//
// It returns an error when voice is nil, when the file cannot be opened,
// inspected or decoded, or when the model fails.
func (bot *Bot) ConvertToText(voice *string) (*[]string, error) {
	if voice == nil {
		return nil, fmt.Errorf("converting speech to text: voice path is nil")
	}

	// Open audio
	f, err := os.Open(*voice)
	if err != nil {
		return nil, fmt.Errorf("opening %s failed: %w", *voice, err)
	}
	// The file is only read; close it on every return path.
	defer f.Close()

	// Stat audio through the open handle, the WAV reader needs the file size.
	i, err := f.Stat()
	if err != nil {
		return nil, fmt.Errorf("stating %s failed: %w", *voice, err)
	}

	// Create reader
	r, err := wav.NewReader(f, i.Size())
	if err != nil {
		return nil, fmt.Errorf("creating new reader failed: %w", err)
	}

	// Read all samples until the reader reports io.EOF.
	var d []int16
	for {
		s, err := r.ReadSample()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("reading sample failed: %w", err)
		}
		d = append(d, int16(s))
	}

	// Speech to text
	if bot.ExtendedMetaData {
		metadata, err := bot.Model.SpeechToTextWithMetadata(d, bot.MaxResults)
		if err != nil {
			return nil, fmt.Errorf("failed converting speech to text: %w", err)
		}
		defer metadata.Close()

		results := metadataToStrings(metadata)
		return &results, nil
	}

	res, err := bot.Model.SpeechToText(d)
	if err != nil {
		return nil, fmt.Errorf("failed converting speech to text: %w", err)
	}

	results := []string{res}
	return &results, nil
}
// metadataToStrings renders every candidate transcript of m as one string:
// the texts of its tokens concatenated in order, followed by the transcript's
// confidence formatted as " [%0.3f]".
func metadataToStrings(m *asticoqui.Metadata) []string {
	results := make([]string, 0, m.NumTranscripts())
	for _, tr := range m.Transcripts() {
		// A builder avoids re-copying the growing string for every token.
		var sb strings.Builder
		for _, tok := range tr.Tokens() {
			sb.WriteString(tok.Text())
		}
		fmt.Fprintf(&sb, " [%0.3f]", tr.Confidence())
		results = append(results, sb.String())
	}
	return results
}