diff --git a/app/concrete.py b/app/concrete.py
index 032c3a0..9f0a5f3 100644
--- a/app/concrete.py
+++ b/app/concrete.py
@@ -6,7 +6,8 @@ from .abstract import *
 from .public import *
 from .exception import *
 from .dependency import get_logger
-from utils.vits_utils import TextToSpeech
+from utils.vits_utils import TextToSpeech as VITS_TextToSpeech
+from utils.bert_vits2_utils import TextToSpeech as BertVits_TextToSpeech
 from config import Config
 import threading
 import requests
@@ -17,7 +18,11 @@ import time
 import json
 
 # -------- initialize vits --------- #
-vits = TextToSpeech()
+vits = VITS_TextToSpeech()
+# ---------------------------------- #
+
+# ------ initialize bert-vits ------ #
+bert_vits = BertVits_TextToSpeech()
 # ---------------------------------- #
 
 # ------- initialize logger -------- #
@@ -294,6 +299,14 @@ class VITS_TTS(TTS):
     def synthetize(self, assistant, text):
         tts_info = json.loads(assistant.tts_info)
         return vits.synthesize(text, tts_info)
+
+class BertVits_TTS(TTS):
+    def __init__(self):
+        pass
+
+    def synthetize(self, assistant, text):
+        tts_info = json.loads(assistant.tts_info)
+        return bert_vits.synthesize(text, tts_info)
 # --------------------------------- #
 
 
@@ -319,6 +332,8 @@ class TTSFactory:
     def create_tts(self,tts_type:str) -> TTS:
         if tts_type == 'VITS':
             return VITS_TTS()
+        if tts_type == 'BertVits':
+            return BertVits_TTS()
 # --------------------------------- #
 
 
@@ -420,7 +435,12 @@ class Agent():
         self.tts_audio_service_chain.add_service(TTSAudioRecordService())
 
     def init_recorder(self,user_id):
-        self.recorder = Recorder(user_id)
+        input_sr = 16000
+        if isinstance(self.tts, BertVits_TTS):
+            output_sr = 44100
+        elif isinstance(self.tts, VITS_TTS):
+            output_sr = 22050
+        self.recorder = Recorder(user_id,input_sr,output_sr)
 
     # preprocess the user's input audio
     def user_audio_process(self, audio):
diff --git a/app/public.py b/app/public.py
index 24f92d1..3a276c8 100644
--- a/app/public.py
+++ b/app/public.py
@@ -30,12 +30,12 @@ class SentenceSegmentation():
         return self.__sentenceSegmentation(llm_chunk)
 
 class Recorder:
-    def __init__(self, user_id):
+    def __init__(self, user_id, input_sr, output_sr):
         self.input_wav_path = 'storage/wav/'+ datetime.now().strftime('%Y%m%d%H%M%S') + 'U' + user_id + 'i.wav'
         self.output_wav_path = 'storage/wav/'+ datetime.now().strftime('%Y%m%d%H%M%S') + 'U' + user_id + 'o.wav'
         self.out_put_text_path = 'storage/record/'+ datetime.now().strftime('%Y%m%d%H%M%S') + 'U' + user_id + 'o.txt'
-        self.input_sr = 16000
-        self.output_sr = 22050
+        self.input_sr = input_sr
+        self.output_sr = output_sr
         self.user_audio = b''
         self.tts_audio = b''
         self.input_text = ""
diff --git a/app/schemas.py b/app/schemas.py
index d597836..cc6fd23 100644
--- a/app/schemas.py
+++ b/app/schemas.py
@@ -33,8 +33,13 @@ class update_assistant_deatil_params_request(BaseModel):
     platform:str
     model :str
     temperature :float
+    tts_engine:str
     speaker_id:int
     length_scale:float
+    language:str
+    style_text:str
+    style_weight:float
+
 
 class update_assistant_max_tokens_request(BaseModel):
     max_tokens:int
\ No newline at end of file
diff --git a/config.py b/config.py
index f85c2e8..d54c7e5 100644
--- a/config.py
+++ b/config.py
@@ -1,8 +1,5 @@
 class Config:
     SQLITE_URL = 'sqlite:///takway.db'
-    ASR = "XF" # select the speech recognition engine here
-    LLM = "MINIMAX" # select the LLM here
-    TTS = "VITS" # select the TTS engine here
     LOG_LEVEL = "DEBUG"
     class UVICORN:
         HOST = '0.0.0.0'
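Two notes on app/concrete.py above: init_recorder binds output_sr only for the two known engine classes, so an unrecognized tts_type would raise UnboundLocalError at the Recorder call; and with the global TTS switch gone from Config, engine selection now flows per assistant through user_info['tts_type'] and TTSFactory. A minimal wiring sketch under those assumptions (the assistant object and the 'VITS' fallback are illustrative, not part of this diff):

    import json
    from app.concrete import TTSFactory

    # assistant: a DB row whose user_info/tts_info columns hold JSON strings
    user_info = json.loads(assistant.user_info)
    tts = TTSFactory().create_tts(user_info.get('tts_type', 'VITS'))  # fallback is an assumption
    audio = tts.synthetize(assistant, "你好")  # raw PCM bytes for BertVits_TTS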
diff --git a/main.py b/main.py
index 93a3746..42b0444 100644
--- a/main.py
+++ b/main.py
@@ -122,7 +122,16 @@ async def update_assistant_deatil_params(id: str,request: update_assistant_deatil_params_request):
     llm_info['temperature'] = request.temperature
     tts_info['speaker_id'] = request.speaker_id
     tts_info['length_scale'] = request.length_scale
+    tts_info['language'] = request.language
+    tts_info['style_text'] = request.style_text
+    tts_info['style_weight'] = request.style_weight
+    tts_info['sdp_ratio'] = 0.5
+    tts_info['opt_cut_by_send'] = False
+    tts_info['interval_between_para'] = 1.0
+    tts_info['interval_between_sent'] = 0.2
+    tts_info['en_ratio'] = 1.0
     user_info['llm_type'] = request.platform
+    user_info['tts_type'] = request.tts_engine
     assistant.llm_info = json.dumps(llm_info, ensure_ascii=False)
     assistant.tts_info = json.dumps(tts_info, ensure_ascii=False)
     assistant.user_info = json.dumps(user_info, ensure_ascii=False)
@@ -227,15 +236,7 @@ async def streaming_chat(ws: WebSocket):
             agent.recorder.input_text = prompt
             logger.debug("Calling the LLM")
             llm_frames = await agent.chat(assistant, prompt)
-
-            start_time = time.time()
-            is_first_response = True
-            for llm_frame in llm_frames:
-                if is_first_response:
-                    end_time = time.time()
-                    logger.debug(f"Time to first frame: {round(end_time-start_time,3)}s")
-                    is_first_response = False
                 resp_msgs = agent.llm_msg_process(llm_frame)
                 for resp_msg in resp_msgs:
                     llm_text += resp_msg
diff --git a/requirements.txt b/requirements.txt
index db2e993..64f2646 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,5 @@ numba
 librosa
 aiohttp
 'volcengine-python-sdk[ark]'
-zhipuai
\ No newline at end of file
+zhipuai
+pyopenjtalk
\ No newline at end of file
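For reference, a BertVits assistant's tts_info must carry every key the new synthesize() reads. The endpoint above hard-codes several of them, but noise_scale and noise_scale_w are read yet never written here, so they are presumably seeded when the assistant is created. A sketch of the expected shape (values are this diff's defaults where given, assumptions otherwise):

    tts_info = {
        "speaker_id": 0,               # index into self.speakers
        "length_scale": 1.0,           # larger = slower speech
        "language": "mix",             # "ZH", "EN", or "mix"
        "style_text": "",              # emotion prompt; use strongly emotional text
        "style_weight": 0.7,           # assumed; set per request in practice
        "sdp_ratio": 0.5,
        "opt_cut_by_send": False,
        "interval_between_para": 1.0,  # pause between paragraphs (s)
        "interval_between_sent": 0.2,  # pause between sentences (s)
        "en_ratio": 1.0,               # larger = slower English in "mix"
        "noise_scale": 0.6,            # assumed default; not set by this endpoint
        "noise_scale_w": 0.9,          # assumed default; not set by this endpoint
    }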
diff --git a/test.py b/test.py
deleted file mode 100644
index 2c7185b..0000000
--- a/test.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from utils.bert_vits2_utils import TextToSpeech
-import soundfile as sf
-tts = TextToSpeech()
-tts.print_speakers_info()
-
-audio, sample_rate= tts.synthesize("你好,我好开心", # text
-                0, # speaker id
-                style_text="我很难过!!!!呜呜呜!!!", # emotion prompt; only effective when language=="ZH"
-                style_weight=0.9, # emotion prompt weight
-                language="mix", # language type: "ZH", "EN", or "mix"
-                en_ratio=1.) # English speed in "mix" mode; larger = slower
-save_path = "./tmp2.wav"
-sf.write(save_path, audio, sample_rate)
\ No newline at end of file
diff --git a/tmp2.wav b/tmp2.wav
deleted file mode 100644
index d627eb8..0000000
Binary files a/tmp2.wav and /dev/null differ
diff --git a/utils/bert_vits2_utils.py b/utils/bert_vits2_utils.py
index 8620f4a..072897c 100644
--- a/utils/bert_vits2_utils.py
+++ b/utils/bert_vits2_utils.py
@@ -395,29 +395,28 @@ class TextToSpeech:
     def synthesize(self,
                    text,
-                   speaker_idx=0, # index into self.speakers; selects the speaker
-                   sdp_ratio=0.5,
-                   noise_scale=0.6,
-                   noise_scale_w=0.9,
-                   length_scale=1.0, # larger = slower speech
-                   language="mix", # one of ["ZH", "EN", "mix"]
-                   opt_cut_by_send=False, # split by sentence, on top of paragraph splitting
-                   interval_between_para=1.0, # pause between paragraphs (s); only effective if larger than the sentence pause
-                   interval_between_sent=0.2, # pause between sentences (s); only applies when splitting by sentence
-                   audio_prompt=None,
-                   text_prompt="",
-                   prompt_mode="Text prompts",
-                   style_text="", # auxiliary text whose semantics guide generation (keep the same language as the main text);
-                                  # note: use strongly emotional text (e.g. "I'm so happy!!!"), not instruction-style text (e.g. "happy");
-                                  # the effect is fairly subtle; leave empty to disable
-                   style_weight=0.7, # BERT mixing ratio of main and auxiliary text: 0 = main text only, 1 = auxiliary text only
-                   en_ratio=1.0 # English speed control in mixed text; larger = slower English
+                   tts_info,
                    ):
         """
         return: audio, sample_rate
         """
+        speaker_id = tts_info['speaker_id'] # index into self.speakers; selects the speaker
+        sdp_ratio = tts_info['sdp_ratio']
+        noise_scale = tts_info['noise_scale']
+        noise_scale_w = tts_info['noise_scale_w']
+        length_scale = tts_info['length_scale']
+        language = tts_info['language'] # one of ["ZH", "EN", "mix"]
+        opt_cut_by_send = tts_info['opt_cut_by_send']
+        interval_between_para = tts_info['interval_between_para'] # pause between paragraphs (s); only effective if larger than the sentence pause
+        interval_between_sent = tts_info['interval_between_sent'] # pause between sentences (s); only applies when splitting by sentence
+        audio_prompt = None
+        text_prompt = ""
+        prompt_mode = "Text prompts"
+        style_text = tts_info['style_text']
+        style_weight = tts_info['style_weight']
+        en_ratio = tts_info['en_ratio']
 
-        speaker = self.speakers[speaker_idx]
+        speaker = self.speakers[speaker_id]
         if language == "mix":
             language, text = self.format_utils(text, speaker)
@@ -455,9 +454,17 @@ class TextToSpeech:
                 style_weight
             )
-        # return text_output, audio_output
-        return audio_output[1], audio_output[0]
+        return self.convert_numpy_to_bytes(audio_output[1])
 
     def print_speakers_info(self):
         for i, speaker in enumerate(self.speakers):
             print(f"id: {i}, speaker: {speaker}")
+
+    def convert_numpy_to_bytes(self, audio_data):
+        if isinstance(audio_data, np.ndarray):
+            if audio_data.dtype == np.dtype('float32'):
+                audio_data = np.int16(audio_data * np.iinfo(np.int16).max)
+            audio_data = audio_data.tobytes()
+            return audio_data
+        else:
+            raise TypeError("audio_data must be a numpy array")
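The deleted test.py exercised the old keyword-argument signature, and the retained docstring ("return: audio, sample_rate") is now stale: synthesize() returns int16 PCM bytes via convert_numpy_to_bytes. A rough replacement smoke test under the new signature (the output path and the 44100 Hz rate, taken from init_recorder above, are assumptions):

    import numpy as np
    import soundfile as sf
    from utils.bert_vits2_utils import TextToSpeech

    tts = TextToSpeech()
    tts.print_speakers_info()

    tts_info = {
        "speaker_id": 0, "sdp_ratio": 0.5, "noise_scale": 0.6, "noise_scale_w": 0.9,
        "length_scale": 1.0, "language": "mix", "opt_cut_by_send": False,
        "interval_between_para": 1.0, "interval_between_sent": 0.2,
        "style_text": "我很难过!!!!呜呜呜!!!", "style_weight": 0.9, "en_ratio": 1.0,
    }
    pcm = tts.synthesize("你好,我好开心", tts_info)   # raw int16 PCM bytes
    audio = np.frombuffer(pcm, dtype=np.int16)        # back to samples for saving
    sf.write("./tmp_bert_vits.wav", audio, 44100)     # 44.1 kHz per init_recorder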