1
0
Fork 0
TakwayPlatform/examples/tts_demo.ipynb

215 lines
9.3 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Importing the dtw module. When using in academic works please cite:\n",
" T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
" J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
"\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.append(\"../\")\n",
"from utils.tts.openvoice_utils import TextToSpeech\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
" warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n",
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache C:\\Users\\bing\\AppData\\Local\\Temp\\jieba.cache\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"load base tts model successfully!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading model cost 0.304 seconds.\n",
"Prefix dict has been built successfully.\n",
"Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\modules\\conv.py:797: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cudnn\\Conv_v8.cpp:919.)\n",
" return F.conv_transpose1d(\n",
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\modules\\conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cudnn\\Conv_v8.cpp:919.)\n",
" return F.conv1d(input, weight, bias, self.stride,\n",
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
" warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"generate base speech!\n",
"**********************,tts sr 44100\n",
"audio segment length is [torch.Size([81565])]\n",
"True\n",
"Loaded checkpoint 'D:\\python\\OpenVoice\\checkpoints_v2\\converter/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
"load tone color converter successfully!\n"
]
}
],
"source": [
"model = TextToSpeech(use_tone_convert=True, device=\"cuda\", debug=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 测试用将mp3转为int32类型的numpy对齐输入端\n",
"from pydub import AudioSegment\n",
"import numpy as np\n",
"source_audio=r\"D:\\python\\OpenVoice\\resources\\demo_speaker0.mp3\"\n",
"audio = AudioSegment.from_file(source_audio, format=\"mp3\")\n",
"raw_data = audio.raw_data\n",
"audio_array = np.frombuffer(raw_data, dtype=np.int32)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenVoice version: v2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\functional.py:665: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.\n",
"Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\SpectralOps.cpp:878.)\n",
" return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]\n",
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\modules\\conv.py:456: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cudnn\\Conv_v8.cpp:919.)\n",
" return F.conv2d(input, weight, bias, self.stride,\n"
]
}
],
"source": [
"# 获取并设置目标说话人的speaker embedding\n",
"# audio_array :输入的音频信号,类型为 np.ndarray\n",
"# 获取speaker embedding\n",
"target_se = model.audio2emb(audio_array, rate=44100, vad=True)\n",
"# 将模型的默认目标说话人embedding设置为 target_se\n",
"model.initialize_target_se(target_se)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\modules\\conv.py:797: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cudnn\\Conv_v8.cpp:919.)\n",
" return F.conv_transpose1d(\n",
"c:\\Users\\bing\\.conda\\envs\\openVoice\\lib\\site-packages\\torch\\nn\\modules\\conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cudnn\\Conv_v8.cpp:919.)\n",
" return F.conv1d(input, weight, bias, self.stride,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"generate base speech!\n",
"**********************,tts sr 44100\n",
"audio segment length is [torch.Size([216378])]\n",
"Audio saved to D:\\python\\OpenVoice\\outputs_v2\\demo_tts.wav\n"
]
}
],
"source": [
"# 测试base_tts不含音色转换\n",
"text = \"你好呀,我不知道该怎么告诉你这件事,但是我真的很需要你。\"\n",
"audio, sr = model._base_tts(text, speed=1)\n",
"audio = model.tensor2numpy(audio)\n",
"model.save_audio(audio, sr, r\"D:\\python\\OpenVoice\\outputs_v2\\demo_tts.wav\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"generate base speech!\n",
"**********************,tts sr 44100\n",
"audio segment length is [torch.Size([216378])]\n",
"torch.float32\n",
"**********************************, convert sr 22050\n",
"tone color has been converted!\n",
"Audio saved to D:\\python\\OpenVoice\\outputs_v2\\demo.wav\n"
]
}
],
"source": [
"# 测试整体pipeline包含音色转换\n",
"text = \"你好呀,我不知道该怎么告诉你这件事,但是我真的很需要你。\"\n",
"audio_bytes, sr = model.tts(text, speed=1)\n",
"audio = np.frombuffer(audio_bytes, dtype=np.int16).flatten()\n",
"model.save_audio(audio, sr, r\"D:\\python\\OpenVoice\\outputs_v2\\demo.wav\" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "openVoice",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}