TakwayPlatform/utils/stt/funasr_utils.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ####################################################### #
# FunAutoSpeechRecognizer: https://github.com/alibaba-damo-academy/FunASR
# ####################################################### #
import io
import numpy as np
import base64
import wave
from funasr import AutoModel
from .base_stt import STTBase

def decode_str2bytes(data):
    # 将Base64编码的字节串解码为字节串
    if data is None:
        return None
    return base64.b64decode(data.encode('utf-8'))

class FunAutoSpeechRecognizer(STTBase):
    def __init__(self, 
                 model_path="paraformer-zh-streaming", 
                 device="cuda", 
                 RATE=16000, 
                 cfg_path=None, 
                 debug=False, 
                 chunk_ms=480, 
                 encoder_chunk_look_back=4, 
                 decoder_chunk_look_back=1, 
                 **kwargs):
        super().__init__(RATE=RATE, cfg_path=cfg_path, debug=debug)
        
        
        self.asr_model = AutoModel(model=model_path, device=device, **kwargs)
        
        self.encoder_chunk_look_back = encoder_chunk_look_back #number of chunks to lookback for encoder self-attention
        self.decoder_chunk_look_back = decoder_chunk_look_back #number of encoder chunks to lookback for decoder cross-attention
        
        #[0, 8, 4] 480ms, [0, 10, 5] 600ms
        if chunk_ms == 480:
            self.chunk_size = [0, 8, 4] 
        elif chunk_ms == 600:
            self.chunk_size = [0, 10, 5]
        else:
            raise ValueError("`chunk_ms` should be 480 or 600, and type is int.")
        self.chunk_partial_size = self.chunk_size[1] * 960
        self.audio_cache = {}
        self.asr_cache = {}

        # self.audio_cache = None
        # self.asr_cache = {}
        
        self._init_asr()
        
    def check_audio_type(self, audio_data):
        """check audio data type and convert it to bytes if necessary."""
        if isinstance(audio_data, bytes):
            pass
        elif isinstance(audio_data, list):
            audio_data = b''.join(audio_data)
        elif isinstance(audio_data, str):
            audio_data = decode_str2bytes(audio_data)
        elif isinstance(audio_data, io.BytesIO):
            wf = wave.open(audio_data, 'rb')
            audio_data = wf.readframes(wf.getnframes())
        elif isinstance(audio_data, np.ndarray):
            pass
        else:
            raise TypeError(f"audio_data must be bytes, list, str, \
                io.BytesIO or numpy array, but got {type(audio_data)}")
        
        if isinstance(audio_data, bytes):
            audio_data = np.frombuffer(audio_data, dtype=np.int16)
        elif isinstance(audio_data, np.ndarray):
            if audio_data.dtype != np.int16:
                audio_data = audio_data.astype(np.int16)
        else:
            raise TypeError(f"audio_data must be bytes or numpy array, but got {type(audio_data)}")
        return audio_data
    
    def _init_asr(self):
        # 随机初始化一段音频数据
        init_audio_data = np.random.randint(-32768, 32767, size=self.chunk_partial_size, dtype=np.int16)
        self.session_signup("init")
        self.asr_model.generate(input=init_audio_data, cache=self.asr_cache, is_final=False, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back, session_id="init")
        self.session_signout("init")
        # print("init ASR model done.")
    
    # when chat trying to use asr , sign up
    def session_signup(self,session_id):
        self.audio_cache[session_id] = None
        self.asr_cache[session_id] = {}

    # when chat finish using asr , sign out
    def session_signout(self,session_id):
        del self.audio_cache[session_id]
        del self.asr_cache[session_id]

    def streaming_recognize(self,
                            session_id, 
                            audio_data, 
                            is_end=False, 
                            auto_det_end=False):
        """recognize partial result
        
        Args:
            audio_data: bytes or numpy array, partial audio data
            is_end: bool, whether the audio data is the end of a sentence
            auto_det_end: bool, whether to automatically detect the end of a audio data
        """
        text_dict = dict(text=[], is_end=is_end)

        audio_cache = self.audio_cache[session_id]
        
        audio_data = self.check_audio_type(audio_data)
        if audio_cache is None:
            audio_cache = audio_data
        else:
            if audio_cache.shape[0] > 0:
                audio_cache = np.concatenate([audio_cache, audio_data], axis=0)
        
        if not is_end and audio_cache.shape[0] < self.chunk_partial_size:
            self.audio_cache[session_id] = audio_cache
            return text_dict
        
        total_chunk_num = int((len(audio_cache)-1)/self.chunk_partial_size)
        
        if is_end:
            # if the audio data is the end of a sentence, \
            # we need to add one more chunk to the end to \
            # ensure the end of the sentence is recognized correctly.
            auto_det_end = True
        
        if auto_det_end:
            total_chunk_num += 1

        end_idx = None
        for i in range(total_chunk_num):
            if auto_det_end:
                is_end = i == total_chunk_num - 1
            start_idx = i*self.chunk_partial_size
            if auto_det_end:
                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
            else:
                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
            # t_stamp = time.time()
            
            speech_chunk = audio_cache[start_idx:end_idx]

            # TODO: exceptions processes
            # print("i:", i)
            try:
                res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back, session_id=session_id)
            except ValueError as e:
                print(f"ValueError: {e}")
                continue
            text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
            # print(f"each chunk time: {time.time()-t_stamp}")
            
        if is_end:
            audio_cache = None
        else:
            if end_idx:
                audio_cache = audio_cache[end_idx:] # cut the processed part from audio_cache
        text_dict['is_end'] = is_end
        
        
        self.audio_cache[session_id] = audio_cache
        return text_dict
    
    def streaming_recognize_origin(self,
                            session_id, 
                            audio_data, 
                            is_end=False, 
                            auto_det_end=False):
        """recognize partial result
        
        Args:
            audio_data: bytes or numpy array, partial audio data
            is_end: bool, whether the audio data is the end of a sentence
            auto_det_end: bool, whether to automatically detect the end of a audio data
        """
        text_dict = dict(text=[], is_end=is_end)

        audio_cache = self.audio_cache[session_id]
        asr_cache = self.asr_cache[session_id]
        
        audio_data = self.check_audio_type(audio_data)
        if audio_cache is None:
            audio_cache = audio_data
        else:
            if audio_cache.shape[0] > 0:
                audio_cache = np.concatenate([audio_cache, audio_data], axis=0)
        
        if not is_end and audio_cache.shape[0] < self.chunk_partial_size:
            self.audio_cache[session_id] = audio_cache
            return text_dict
        
        total_chunk_num = int((len(audio_cache)-1)/self.chunk_partial_size)
        
        if is_end:
            # if the audio data is the end of a sentence, \
            # we need to add one more chunk to the end to \
            # ensure the end of the sentence is recognized correctly.
            auto_det_end = True
        
        if auto_det_end:
            total_chunk_num += 1

        end_idx = None
        for i in range(total_chunk_num):
            if auto_det_end:
                is_end = i == total_chunk_num - 1
            start_idx = i*self.chunk_partial_size
            if auto_det_end:
                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
            else:
                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
            # t_stamp = time.time()
            
            speech_chunk = audio_cache[start_idx:end_idx]

            # TODO: exceptions processes
            try:
                res = self.asr_model.generate(input=speech_chunk, cache=asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
            except ValueError as e:
                print(f"ValueError: {e}")
                continue
            text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
            # print(f"each chunk time: {time.time()-t_stamp}")
            
        if is_end:
            audio_cache = None
            asr_cache = {}
        else:
            if end_idx:
                audio_cache = audio_cache[end_idx:] # cut the processed part from audio_cache
        text_dict['is_end'] = is_end
        
        
        self.audio_cache[session_id] = audio_cache
        self.asr_cache[session_id] = asr_cache
        return text_dict


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ####################################################### #
# FunAutoSpeechRecognizer: https://github.com/alibaba-damo-academy/FunASR
# ####################################################### #
# import io
# import numpy as np
# import base64
# import wave
# from funasr import AutoModel
# from .base_stt import STTBase

# def decode_str2bytes(data):
#     # 将Base64编码的字节串解码为字节串
#     if data is None:
#         return None
#     return base64.b64decode(data.encode('utf-8'))

# class FunAutoSpeechRecognizer(STTBase):
#     def __init__(self, 
#                  model_path="paraformer-zh-streaming", 
#                  device="cuda", 
#                  RATE=16000, 
#                  cfg_path=None, 
#                  debug=False, 
#                  chunk_ms=480, 
#                  encoder_chunk_look_back=4, 
#                  decoder_chunk_look_back=1, 
#                  **kwargs):
#         super().__init__(RATE=RATE, cfg_path=cfg_path, debug=debug)
        
#         self.asr_model = AutoModel(model=model_path, device=device, **kwargs)
        
#         self.encoder_chunk_look_back = encoder_chunk_look_back #number of chunks to lookback for encoder self-attention
#         self.decoder_chunk_look_back = decoder_chunk_look_back #number of encoder chunks to lookback for decoder cross-attention
        
#         #[0, 8, 4] 480ms, [0, 10, 5] 600ms
#         if chunk_ms == 480:
#             self.chunk_size = [0, 8, 4] 
#         elif chunk_ms == 600:
#             self.chunk_size = [0, 10, 5]
#         else:
#             raise ValueError("`chunk_ms` should be 480 or 600, and type is int.")
#         self.chunk_partial_size = self.chunk_size[1] * 960 
#         self.audio_cache = None
#         self.asr_cache = {}
        
        
#         self._init_asr()
        
#     def check_audio_type(self, audio_data):
#         """check audio data type and convert it to bytes if necessary."""
#         if isinstance(audio_data, bytes):
#             pass
#         elif isinstance(audio_data, list):
#             audio_data = b''.join(audio_data)
#         elif isinstance(audio_data, str):
#             audio_data = decode_str2bytes(audio_data)
#         elif isinstance(audio_data, io.BytesIO):
#             wf = wave.open(audio_data, 'rb')
#             audio_data = wf.readframes(wf.getnframes())
#         elif isinstance(audio_data, np.ndarray):
#             pass
#         else:
#             raise TypeError(f"audio_data must be bytes, list, str, \
#                 io.BytesIO or numpy array, but got {type(audio_data)}")
        
#         if isinstance(audio_data, bytes):
#             audio_data = np.frombuffer(audio_data, dtype=np.int16)
#         elif isinstance(audio_data, np.ndarray):
#             if audio_data.dtype != np.int16:
#                 audio_data = audio_data.astype(np.int16)
#         else:
#             raise TypeError(f"audio_data must be bytes or numpy array, but got {type(audio_data)}")
#         return audio_data
    
#     def _init_asr(self):
#         # 随机初始化一段音频数据
#         init_audio_data = np.random.randint(-32768, 32767, size=self.chunk_partial_size, dtype=np.int16)
#         self.asr_model.generate(input=init_audio_data, cache=self.asr_cache, is_final=False, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
#         self.audio_cache = None
#         self.asr_cache = {}
#         # print("init ASR model done.")
    
#     def recognize(self, audio_data):
#         """recognize audio data to text"""
#         audio_data = self.check_audio_type(audio_data)
#         result = self.asr_model.generate(input=audio_data, 
#                      batch_size_s=300, 
#                      hotword=self.hotwords)
        
#         # print(result)
#         text = ''
#         for res in result:
#             text += res['text']
#         return text
    
#     def streaming_recognize(self, 
#                             audio_data, 
#                             is_end=False, 
#                             auto_det_end=False):
#         """recognize partial result
        
#         Args:
#             audio_data: bytes or numpy array, partial audio data
#             is_end: bool, whether the audio data is the end of a sentence
#             auto_det_end: bool, whether to automatically detect the end of a audio data
#         """
#         text_dict = dict(text=[], is_end=is_end)
        
#         audio_data = self.check_audio_type(audio_data)
#         if self.audio_cache is None:
#             self.audio_cache = audio_data
#         else:
#             # print(f"audio_data: {audio_data.shape}, audio_cache: {self.audio_cache.shape}")
#             if self.audio_cache.shape[0] > 0:
#                 self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0)
        
#         if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size:
#             return text_dict
        
#         total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size)
        
#         if is_end:
#             # if the audio data is the end of a sentence, \
#             # we need to add one more chunk to the end to \
#             # ensure the end of the sentence is recognized correctly.
#             auto_det_end = True
        
#         if auto_det_end:
#             total_chunk_num += 1

#         # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
#         end_idx = None
#         for i in range(total_chunk_num):
#             if auto_det_end:
#                 is_end = i == total_chunk_num - 1
#             start_idx = i*self.chunk_partial_size
#             if auto_det_end:
#                 end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
#             else:
#                 end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
#             # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
#             # t_stamp = time.time()
            
#             speech_chunk = self.audio_cache[start_idx:end_idx]

#             # TODO: exceptions processes
#             try:
#                 res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
#             except ValueError as e:
#                 print(f"ValueError: {e}")
#                 continue
#             text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
#             # print(f"each chunk time: {time.time()-t_stamp}")
            
#         if is_end:
#             self.audio_cache = None
#             self.asr_cache = {}
#         else:
#             if end_idx:
#                 self.audio_cache = self.audio_cache[end_idx:] # cut the processed part from audio_cache
#         text_dict['is_end'] = is_end
        
#         # print(f"text_dict: {text_dict}")
#         return text_dict
仓库初始化 2024-05-01 17:18:30 +08:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`# ####################################################### #`
			`# FunAutoSpeechRecognizer: https://github.com/alibaba-damo-academy/FunASR`
			`# ####################################################### #`
			`import io`
			`import numpy as np`
			`import base64`
			`import wave`
			`from funasr import AutoModel`
			`from .base_stt import STTBase`

			`def decode_str2bytes(data):`
			`# 将Base64编码的字节串解码为字节串`
			`if data is None:`
			`return None`
			`return base64.b64decode(data.encode('utf-8'))`

			`class FunAutoSpeechRecognizer(STTBase):`
			`def __init__(self,`
			`model_path="paraformer-zh-streaming",`
			`device="cuda",`
			`RATE=16000,`
			`cfg_path=None,`
			`debug=False,`
			`chunk_ms=480,`
			`encoder_chunk_look_back=4,`
			`decoder_chunk_look_back=1,`
			`**kwargs):`
			`super().__init__(RATE=RATE, cfg_path=cfg_path, debug=debug)`

[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 22:38:12 +08:00
仓库初始化 2024-05-01 17:18:30 +08:00			`self.asr_model = AutoModel(model=model_path, device=device, **kwargs)`

			`self.encoder_chunk_look_back = encoder_chunk_look_back #number of chunks to lookback for encoder self-attention`
			`self.decoder_chunk_look_back = decoder_chunk_look_back #number of encoder chunks to lookback for decoder cross-attention`

			`#[0, 8, 4] 480ms, [0, 10, 5] 600ms`
			`if chunk_ms == 480:`
			`self.chunk_size = [0, 8, 4]`
			`elif chunk_ms == 600:`
			`self.chunk_size = [0, 10, 5]`
			`else:`
			raise ValueError("`chunk_ms` should be 480 or 600, and type is int.")
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`self.chunk_partial_size = self.chunk_size[1] * 960`
			`self.audio_cache = {}`
仓库初始化 2024-05-01 17:18:30 +08:00			`self.asr_cache = {}`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00
			`# self.audio_cache = None`
			`# self.asr_cache = {}`
仓库初始化 2024-05-01 17:18:30 +08:00
			`self._init_asr()`

			`def check_audio_type(self, audio_data):`
			`"""check audio data type and convert it to bytes if necessary."""`
			`if isinstance(audio_data, bytes):`
			`pass`
			`elif isinstance(audio_data, list):`
			`audio_data = b''.join(audio_data)`
			`elif isinstance(audio_data, str):`
			`audio_data = decode_str2bytes(audio_data)`
			`elif isinstance(audio_data, io.BytesIO):`
			`wf = wave.open(audio_data, 'rb')`
			`audio_data = wf.readframes(wf.getnframes())`
			`elif isinstance(audio_data, np.ndarray):`
			`pass`
			`else:`
			`raise TypeError(f"audio_data must be bytes, list, str, \`
			`io.BytesIO or numpy array, but got {type(audio_data)}")`

			`if isinstance(audio_data, bytes):`
			`audio_data = np.frombuffer(audio_data, dtype=np.int16)`
			`elif isinstance(audio_data, np.ndarray):`
			`if audio_data.dtype != np.int16:`
			`audio_data = audio_data.astype(np.int16)`
			`else:`
			`raise TypeError(f"audio_data must be bytes or numpy array, but got {type(audio_data)}")`
			`return audio_data`

			`def _init_asr(self):`
			`# 随机初始化一段音频数据`
			`init_audio_data = np.random.randint(-32768, 32767, size=self.chunk_partial_size, dtype=np.int16)`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 23:01:17 +08:00			`self.session_signup("init")`
			`self.asr_model.generate(input=init_audio_data, cache=self.asr_cache, is_final=False, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back, session_id="init")`
			`self.session_signout("init")`
仓库初始化 2024-05-01 17:18:30 +08:00			`# print("init ASR model done.")`

feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`# when chat trying to use asr , sign up`
			`def session_signup(self,session_id):`
			`self.audio_cache[session_id] = None`
			`self.asr_cache[session_id] = {}`

			`# when chat finish using asr , sign out`
			`def session_signout(self,session_id):`
			`del self.audio_cache[session_id]`
			`del self.asr_cache[session_id]`

			`def streaming_recognize(self,`
			`session_id,`
仓库初始化 2024-05-01 17:18:30 +08:00			`audio_data,`
			`is_end=False,`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 21:56:10 +08:00			`auto_det_end=False):`
			`"""recognize partial result`

			`Args:`
			`audio_data: bytes or numpy array, partial audio data`
			`is_end: bool, whether the audio data is the end of a sentence`
			`auto_det_end: bool, whether to automatically detect the end of a audio data`
			`"""`
			`text_dict = dict(text=[], is_end=is_end)`

			`audio_cache = self.audio_cache[session_id]`

			`audio_data = self.check_audio_type(audio_data)`
			`if audio_cache is None:`
			`audio_cache = audio_data`
			`else:`
			`if audio_cache.shape[0] > 0:`
			`audio_cache = np.concatenate([audio_cache, audio_data], axis=0)`

			`if not is_end and audio_cache.shape[0] < self.chunk_partial_size:`
			`self.audio_cache[session_id] = audio_cache`
			`return text_dict`

			`total_chunk_num = int((len(audio_cache)-1)/self.chunk_partial_size)`

			`if is_end:`
			`# if the audio data is the end of a sentence, \`
			`# we need to add one more chunk to the end to \`
			`# ensure the end of the sentence is recognized correctly.`
			`auto_det_end = True`

			`if auto_det_end:`
			`total_chunk_num += 1`

			`end_idx = None`
			`for i in range(total_chunk_num):`
			`if auto_det_end:`
			`is_end = i == total_chunk_num - 1`
			`start_idx = i*self.chunk_partial_size`
			`if auto_det_end:`
			`end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1`
			`else:`
			`end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1`
			`# print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")`
			`# t_stamp = time.time()`

			`speech_chunk = audio_cache[start_idx:end_idx]`

			`# TODO: exceptions processes`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 22:38:12 +08:00			`# print("i:", i)`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 21:56:10 +08:00			`try:`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 23:01:17 +08:00			`res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back, session_id=session_id)`
[update] 更新funasr parformer_streaming cache管理，增加seesion_id字段 2024-05-15 21:56:10 +08:00			`except ValueError as e:`
			`print(f"ValueError: {e}")`
			`continue`
			`text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))`
			`# print(f"each chunk time: {time.time()-t_stamp}")`

			`if is_end:`
			`audio_cache = None`
			`else:`
			`if end_idx:`
			`audio_cache = audio_cache[end_idx:] # cut the processed part from audio_cache`
			`text_dict['is_end'] = is_end`


			`self.audio_cache[session_id] = audio_cache`
			`return text_dict`

			`def streaming_recognize_origin(self,`
			`session_id,`
			`audio_data,`
			`is_end=False,`
仓库初始化 2024-05-01 17:18:30 +08:00			`auto_det_end=False):`
			`"""recognize partial result`

			`Args:`
			`audio_data: bytes or numpy array, partial audio data`
			`is_end: bool, whether the audio data is the end of a sentence`
			`auto_det_end: bool, whether to automatically detect the end of a audio data`
			`"""`
			`text_dict = dict(text=[], is_end=is_end)`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00
			`audio_cache = self.audio_cache[session_id]`
			`asr_cache = self.asr_cache[session_id]`
仓库初始化 2024-05-01 17:18:30 +08:00
			`audio_data = self.check_audio_type(audio_data)`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`if audio_cache is None:`
			`audio_cache = audio_data`
仓库初始化 2024-05-01 17:18:30 +08:00			`else:`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`if audio_cache.shape[0] > 0:`
			`audio_cache = np.concatenate([audio_cache, audio_data], axis=0)`
仓库初始化 2024-05-01 17:18:30 +08:00
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`if not is_end and audio_cache.shape[0] < self.chunk_partial_size:`
			`self.audio_cache[session_id] = audio_cache`
仓库初始化 2024-05-01 17:18:30 +08:00			`return text_dict`

feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`total_chunk_num = int((len(audio_cache)-1)/self.chunk_partial_size)`
仓库初始化 2024-05-01 17:18:30 +08:00
			`if is_end:`
			`# if the audio data is the end of a sentence, \`
			`# we need to add one more chunk to the end to \`
			`# ensure the end of the sentence is recognized correctly.`
			`auto_det_end = True`

			`if auto_det_end:`
			`total_chunk_num += 1`

			`end_idx = None`
			`for i in range(total_chunk_num):`
			`if auto_det_end:`
			`is_end = i == total_chunk_num - 1`
			`start_idx = i*self.chunk_partial_size`
			`if auto_det_end:`
			`end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1`
			`else:`
			`end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1`
			`# print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")`
			`# t_stamp = time.time()`

feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`speech_chunk = audio_cache[start_idx:end_idx]`
仓库初始化 2024-05-01 17:18:30 +08:00
			`# TODO: exceptions processes`
			`try:`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`res = self.asr_model.generate(input=speech_chunk, cache=asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)`
仓库初始化 2024-05-01 17:18:30 +08:00			`except ValueError as e:`
			`print(f"ValueError: {e}")`
			`continue`
			`text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))`
			`# print(f"each chunk time: {time.time()-t_stamp}")`

			`if is_end:`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`audio_cache = None`
			`asr_cache = {}`
仓库初始化 2024-05-01 17:18:30 +08:00			`else:`
			`if end_idx:`
feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`audio_cache = audio_cache[end_idx:] # cut the processed part from audio_cache`
仓库初始化 2024-05-01 17:18:30 +08:00			`text_dict['is_end'] = is_end`

feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00
			`self.audio_cache[session_id] = audio_cache`
			`self.asr_cache[session_id] = asr_cache`
仓库初始化 2024-05-01 17:18:30 +08:00			`return text_dict`


feat: 使全局asr实例有一定并发能力 2024-05-13 10:44:58 +08:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`# ####################################################### #`
			`# FunAutoSpeechRecognizer: https://github.com/alibaba-damo-academy/FunASR`
			`# ####################################################### #`
			`# import io`
			`# import numpy as np`
			`# import base64`
			`# import wave`
			`# from funasr import AutoModel`
			`# from .base_stt import STTBase`

			`# def decode_str2bytes(data):`
			`# # 将Base64编码的字节串解码为字节串`
			`# if data is None:`
			`# return None`
			`# return base64.b64decode(data.encode('utf-8'))`

			`# class FunAutoSpeechRecognizer(STTBase):`
			`# def __init__(self,`
			`# model_path="paraformer-zh-streaming",`
			`# device="cuda",`
			`# RATE=16000,`
			`# cfg_path=None,`
			`# debug=False,`
			`# chunk_ms=480,`
			`# encoder_chunk_look_back=4,`
			`# decoder_chunk_look_back=1,`
			`# **kwargs):`
			`# super().__init__(RATE=RATE, cfg_path=cfg_path, debug=debug)`

			`# self.asr_model = AutoModel(model=model_path, device=device, **kwargs)`

			`# self.encoder_chunk_look_back = encoder_chunk_look_back #number of chunks to lookback for encoder self-attention`
			`# self.decoder_chunk_look_back = decoder_chunk_look_back #number of encoder chunks to lookback for decoder cross-attention`

			`# #[0, 8, 4] 480ms, [0, 10, 5] 600ms`
			`# if chunk_ms == 480:`
			`# self.chunk_size = [0, 8, 4]`
			`# elif chunk_ms == 600:`
			`# self.chunk_size = [0, 10, 5]`
			`# else:`
			# raise ValueError("`chunk_ms` should be 480 or 600, and type is int.")
			`# self.chunk_partial_size = self.chunk_size[1] * 960`
			`# self.audio_cache = None`
			`# self.asr_cache = {}`



			`# self._init_asr()`

			`# def check_audio_type(self, audio_data):`
			`# """check audio data type and convert it to bytes if necessary."""`
			`# if isinstance(audio_data, bytes):`
			`# pass`
			`# elif isinstance(audio_data, list):`
			`# audio_data = b''.join(audio_data)`
			`# elif isinstance(audio_data, str):`
			`# audio_data = decode_str2bytes(audio_data)`
			`# elif isinstance(audio_data, io.BytesIO):`
			`# wf = wave.open(audio_data, 'rb')`
			`# audio_data = wf.readframes(wf.getnframes())`
			`# elif isinstance(audio_data, np.ndarray):`
			`# pass`
			`# else:`
			`# raise TypeError(f"audio_data must be bytes, list, str, \`
			`# io.BytesIO or numpy array, but got {type(audio_data)}")`

			`# if isinstance(audio_data, bytes):`
			`# audio_data = np.frombuffer(audio_data, dtype=np.int16)`
			`# elif isinstance(audio_data, np.ndarray):`
			`# if audio_data.dtype != np.int16:`
			`# audio_data = audio_data.astype(np.int16)`
			`# else:`
			`# raise TypeError(f"audio_data must be bytes or numpy array, but got {type(audio_data)}")`
			`# return audio_data`

			`# def _init_asr(self):`
			`# # 随机初始化一段音频数据`
			`# init_audio_data = np.random.randint(-32768, 32767, size=self.chunk_partial_size, dtype=np.int16)`
			`# self.asr_model.generate(input=init_audio_data, cache=self.asr_cache, is_final=False, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)`
			`# self.audio_cache = None`
			`# self.asr_cache = {}`
			`# # print("init ASR model done.")`

			`# def recognize(self, audio_data):`
			`# """recognize audio data to text"""`
			`# audio_data = self.check_audio_type(audio_data)`
			`# result = self.asr_model.generate(input=audio_data,`
			`# batch_size_s=300,`
			`# hotword=self.hotwords)`

			`# # print(result)`
			`# text = ''`
			`# for res in result:`
			`# text += res['text']`
			`# return text`

			`# def streaming_recognize(self,`
			`# audio_data,`
			`# is_end=False,`
			`# auto_det_end=False):`
			`# """recognize partial result`

			`# Args:`
			`# audio_data: bytes or numpy array, partial audio data`
			`# is_end: bool, whether the audio data is the end of a sentence`
			`# auto_det_end: bool, whether to automatically detect the end of a audio data`
			`# """`
			`# text_dict = dict(text=[], is_end=is_end)`

			`# audio_data = self.check_audio_type(audio_data)`
			`# if self.audio_cache is None:`
			`# self.audio_cache = audio_data`
			`# else:`
			`# # print(f"audio_data: {audio_data.shape}, audio_cache: {self.audio_cache.shape}")`
			`# if self.audio_cache.shape[0] > 0:`
			`# self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0)`

			`# if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size:`
			`# return text_dict`

			`# total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size)`

			`# if is_end:`
			`# # if the audio data is the end of a sentence, \`
			`# # we need to add one more chunk to the end to \`
			`# # ensure the end of the sentence is recognized correctly.`
			`# auto_det_end = True`

			`# if auto_det_end:`
			`# total_chunk_num += 1`

			`# # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")`
			`# end_idx = None`
			`# for i in range(total_chunk_num):`
			`# if auto_det_end:`
			`# is_end = i == total_chunk_num - 1`
			`# start_idx = i*self.chunk_partial_size`
			`# if auto_det_end:`
			`# end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1`
			`# else:`
			`# end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1`
			`# # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")`
			`# # t_stamp = time.time()`

			`# speech_chunk = self.audio_cache[start_idx:end_idx]`

			`# # TODO: exceptions processes`
			`# try:`
			`# res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)`
			`# except ValueError as e:`
			`# print(f"ValueError: {e}")`
			`# continue`
			`# text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))`
			`# # print(f"each chunk time: {time.time()-t_stamp}")`

			`# if is_end:`
			`# self.audio_cache = None`
			`# self.asr_cache = {}`
			`# else:`
			`# if end_idx:`
			`# self.audio_cache = self.audio_cache[end_idx:] # cut the processed part from audio_cache`
			`# text_dict['is_end'] = is_end`

			`# # print(f"text_dict: {text_dict}")`
			`# return text_dict`


仓库初始化 2024-05-01 17:18:30 +08:00