# stt_voice.py 4.84 KB
import base64

import requests
import json
import re
import datetime
import ast

import logging

import soundfile as sf
import codecs
import subprocess
import copy

# Route every record (DEBUG and above) through the root logger into
# 'stt_voice.log', stamped with time, logger name and severity.
logging.basicConfig(
    filename=u'stt_voice.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class Speech_Client:
    """
    HTTP client that chains three services: speech-to-text (STT),
    punctuation restoration (STT.PUNCT) and named-entity recognition (NER).
    """

    def __init__(self, url, port, subscription_key):
        """
        Initialize SDK client for simple work with API server using python.
        :param subscription_key: your subscription key that you can get from our manager.
        :param url: API server url. For example: "https://bigdata.nsu.ru"
        :param port: API server port. For example: "5044"

        :type self: Speech_Client
        :type subscription_key: str
        :type port: int
        :type url: str
        """
        self.url = url
        self.port = port
        self.subscription_key = subscription_key
        self.connection_string = self._build_connection_string()
        self.client = self._new_session()

    def _build_connection_string(self):
        """Return "<url>:<port>/"; the subscription key is not part of it."""
        return self.url + ":" + str(self.port) + "/"

    @staticmethod
    def _new_session():
        """Create a requests session with TLS certificate verification disabled."""
        session = requests.session()
        session.verify = False
        return session

    def change_subscription_key(self, subscription_key):
        """
        Store a new subscription key and start a fresh HTTP session.
        :param subscription_key: the new subscription key.
        :type subscription_key: str
        """
        self.subscription_key = subscription_key
        self.connection_string = self._build_connection_string()
        # Release the pooled connections of the session being replaced.
        previous = getattr(self, "client", None)
        if previous is not None:
            previous.close()
        self.client = self._new_session()

    @staticmethod                   # TODO: multiple dicts in list length
    def getJSON(response):
        """
        Decode the JSON body of a service response.
        :param response: object exposing ``json()``; it is tested for truthiness
        first (a ``requests`` response is falsy for 4xx/5xx status codes).
        :return: the decoded JSON payload.
        :raises TypeError: if ``response`` is falsy.
        """
        if not response:
            raise TypeError("no service output json")
        return response.json()

    def _json_and_close(self, response):
        """Decode ``response`` with getJSON and close it even if decoding fails."""
        try:
            return self.getJSON(response)
        finally:
            response.close()

    def recognize_speech(self, filename, extension):
        """
        Recognize text from speech.

        The audio is converted with ``ffmpeg`` to ``tmp.wav``, then sent through
        the STT, STT.PUNCT and NER services in turn. The working files
        ``tmp.wav`` and ``tmp1.json`` are written to the current directory, and
        ``tmp.json`` must already exist there (it is uploaded with the audio).

        :param filename: path of the audio file without its extension.
        :type filename: str
        :param extension: audio file extension. Supported extensions:
        * WAV
        * MP3
        * OGG
        * FLAC
        :type extension: str
        :return:
        * on success: the JSON document returned by the NER service, unchanged.
        * on any failure: ``{"message": "<error text>; <exception class doc>"}``.
        :rtype: dict
        """
        logger = logging.getLogger(__name__)

        try:
            # NOTE(review): the service ports are hard-coded and self.port is
            # not used here - confirm this is intended.
            stt_connection_string = self.url + ":8000" + "/recognize"
            sttp_connection_string = self.url + ":8001" + "/recognize"
            ner_connection_string = self.url + ":8002" + "/recognize"

            # Convert at most the first 300 seconds to an 8 kHz mono WAV.
            # check=True: a failed conversion must abort the call instead of
            # silently sending a stale tmp.wav left over from a previous run.
            subprocess.run(
                ["ffmpeg", '-y', '-t', '300', '-i', filename + '.' + extension,
                 '-ar', '8000', '-ac', '1', 'tmp.wav'],
                check=True,
            )

            # NOTE(review): the requests below go through the module-level
            # requests.post, not self.client, so the session's verify=False
            # does not apply to them - confirm which behavior is wanted.

            # run STT
            with open('tmp.wav', 'rb') as wav_file, open('tmp.json', 'rb') as json_file:
                files = {'wav': ('tmp.wav', wav_file, 'audio/wave'),
                         'json': ('tmp.json', json_file, 'application/json')}
                logger.info('POST request to connection string: %s', stt_connection_string)
                r = requests.post(stt_connection_string, files=files)
            logger.info('Got response: %s', r.text)
            stt_json = self._json_and_close(r)

            # Persist the STT output; it is the upload for the next stage.
            with codecs.open('tmp1.json', mode='w', encoding='utf-8', errors='ignore') as out:
                json.dump(stt_json, fp=out, ensure_ascii=False)

            # run STT.PUNCT
            with open('tmp1.json', 'rb') as stt_file:
                files = {'json': ('tmp1.json', stt_file, 'application/json')}
                logger.info('POST request to connection string: %s', sttp_connection_string)
                r = requests.post(sttp_connection_string, files=files)
            sttp_json = self._json_and_close(r)
            logger.info('Server response: %s', sttp_json)

            # run NER
            logger.info('POST request to connection string: %s', ner_connection_string)
            # The NER request carries a copy of the punctuated sentences under
            # 'stt.dictors', each with its 'sent' value duplicated as 'text'.
            sttp_json['stt.dictors'] = copy.deepcopy(sttp_json['stt.punct'])
            for sentence in sttp_json['stt.dictors']:
                sentence['text'] = sentence['sent']
            r = requests.post(ner_connection_string, json=sttp_json)
            ner_json = self._json_and_close(r)
            logger.info('Server response: %s', ner_json)

            return ner_json
        except Exception as e:
            # Callers receive the error as data; keep the traceback in the log.
            logger.exception('Speech recognition failed')
            return {"message": str(e)+"; " + str(e.__doc__)}