TakwayPlatform/utils/tts/openvoice/text/mandarin.py

327 lines
7.9 KiB
Python
Raw Normal View History

2024-05-22 15:28:23 +08:00
import os
import sys
import re
from pypinyin import lazy_pinyin, BOPOMOFO
import jieba
import cn2an
import logging
# List of (Latin alphabet, bopomofo) pairs:
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('a', 'ㄟˉ'),
('b', 'ㄅㄧˋ'),
('c', 'ㄙㄧˉ'),
('d', 'ㄉㄧˋ'),
('e', 'ㄧˋ'),
('f', 'ㄝˊㄈㄨˋ'),
('g', 'ㄐㄧˋ'),
('h', 'ㄝˇㄑㄩˋ'),
('i', 'ㄞˋ'),
('j', 'ㄐㄟˋ'),
('k', 'ㄎㄟˋ'),
('l', 'ㄝˊㄛˋ'),
('m', 'ㄝˊㄇㄨˋ'),
('n', 'ㄣˉ'),
('o', 'ㄡˉ'),
('p', 'ㄆㄧˉ'),
('q', 'ㄎㄧㄡˉ'),
('r', 'ㄚˋ'),
('s', 'ㄝˊㄙˋ'),
('t', 'ㄊㄧˋ'),
('u', 'ㄧㄡˉ'),
('v', 'ㄨㄧˉ'),
('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
('x', 'ㄝˉㄎㄨˋㄙˋ'),
('y', 'ㄨㄞˋ'),
('z', 'ㄗㄟˋ')
]]
# List of (bopomofo, romaji) pairs:
_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
('ㄅㄛ', 'p⁼wo'),
('ㄆㄛ', 'pʰwo'),
('ㄇㄛ', 'mwo'),
('ㄈㄛ', 'fwo'),
('', 'p⁼'),
('', ''),
('', 'm'),
('', 'f'),
('', 't⁼'),
('', ''),
('', 'n'),
('', 'l'),
('', 'k⁼'),
('', ''),
('', 'h'),
('', 'ʧ⁼'),
('', 'ʧʰ'),
('', 'ʃ'),
('', 'ʦ`⁼'),
('', 'ʦ`ʰ'),
('', 's`'),
('', 'ɹ`'),
('', 'ʦ⁼'),
('', 'ʦʰ'),
('', 's'),
('', 'a'),
('', 'o'),
('', 'ə'),
('', 'e'),
('', 'ai'),
('', 'ei'),
('', 'au'),
('', 'ou'),
('ㄧㄢ', 'yeNN'),
('', 'aNN'),
('ㄧㄣ', 'iNN'),
('', 'əNN'),
('', 'aNg'),
('ㄧㄥ', 'iNg'),
('ㄨㄥ', 'uNg'),
('ㄩㄥ', 'yuNg'),
('', 'əNg'),
('', 'əɻ'),
('', 'i'),
('', 'u'),
('', 'ɥ'),
('ˉ', ''),
('ˊ', ''),
('ˇ', '↓↑'),
('ˋ', ''),
('˙', ''),
('', ','),
('', '.'),
('', '!'),
('', '?'),
('', '-')
]]
# List of (romaji, ipa) pairs:
_romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('ʃy', 'ʃ'),
('ʧʰy', 'ʧʰ'),
('ʧ⁼y', 'ʧ⁼'),
('NN', 'n'),
('Ng', 'ŋ'),
('y', 'j'),
('h', 'x')
]]
# List of (bopomofo, ipa) pairs:
_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
('ㄅㄛ', 'p⁼wo'),
('ㄆㄛ', 'pʰwo'),
('ㄇㄛ', 'mwo'),
('ㄈㄛ', 'fwo'),
('', 'p⁼'),
('', ''),
('', 'm'),
('', 'f'),
('', 't⁼'),
('', ''),
('', 'n'),
('', 'l'),
('', 'k⁼'),
('', ''),
('', 'x'),
('', 'tʃ⁼'),
('', 'tʃʰ'),
('', 'ʃ'),
('', 'ts`⁼'),
('', 'ts`ʰ'),
('', 's`'),
('', 'ɹ`'),
('', 'ts⁼'),
('', 'tsʰ'),
('', 's'),
('', 'a'),
('', 'o'),
('', 'ə'),
('', 'ɛ'),
('', 'aɪ'),
('', 'eɪ'),
('', 'ɑʊ'),
('', ''),
('ㄧㄢ', 'jɛn'),
('ㄩㄢ', 'ɥæn'),
('', 'an'),
('ㄧㄣ', 'in'),
('ㄩㄣ', 'ɥn'),
('', 'ən'),
('', 'ɑŋ'),
('ㄧㄥ', ''),
('ㄨㄥ', 'ʊŋ'),
('ㄩㄥ', 'jʊŋ'),
('', 'əŋ'),
('', 'əɻ'),
('', 'i'),
('', 'u'),
('', 'ɥ'),
('ˉ', ''),
('ˊ', ''),
('ˇ', '↓↑'),
('ˋ', ''),
('˙', ''),
('', ','),
('', '.'),
('', '!'),
('', '?'),
('', '-')
]]
# List of (bopomofo, ipa2) pairs:
_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
('ㄅㄛ', 'pwo'),
('ㄆㄛ', 'pʰwo'),
('ㄇㄛ', 'mwo'),
('ㄈㄛ', 'fwo'),
('', 'p'),
('', ''),
('', 'm'),
('', 'f'),
('', 't'),
('', ''),
('', 'n'),
('', 'l'),
('', 'k'),
('', ''),
('', 'h'),
('', ''),
('', 'tɕʰ'),
('', 'ɕ'),
('', ''),
('', 'tʂʰ'),
('', 'ʂ'),
('', 'ɻ'),
('', 'ts'),
('', 'tsʰ'),
('', 's'),
('', 'a'),
('', 'o'),
('', 'ɤ'),
('', 'ɛ'),
('', 'aɪ'),
('', 'eɪ'),
('', 'ɑʊ'),
('', ''),
('ㄧㄢ', 'jɛn'),
('ㄩㄢ', 'yæn'),
('', 'an'),
('ㄧㄣ', 'in'),
('ㄩㄣ', 'yn'),
('', 'ən'),
('', 'ɑŋ'),
('ㄧㄥ', ''),
('ㄨㄥ', 'ʊŋ'),
('ㄩㄥ', 'jʊŋ'),
('', 'ɤŋ'),
('', 'əɻ'),
('', 'i'),
('', 'u'),
('', 'y'),
('ˉ', '˥'),
('ˊ', '˧˥'),
('ˇ', '˨˩˦'),
('ˋ', '˥˩'),
('˙', ''),
('', ','),
('', '.'),
('', '!'),
('', '?'),
('', '-')
]]
def number_to_chinese(text):
numbers = re.findall(r'\d+(?:\.?\d+)?', text)
for number in numbers:
text = text.replace(number, cn2an.an2cn(number), 1)
return text
def chinese_to_bopomofo(text):
text = text.replace('', '').replace('', '').replace('', '')
words = jieba.lcut(text, cut_all=False)
text = ''
for word in words:
bopomofos = lazy_pinyin(word, BOPOMOFO)
if not re.search('[\u4e00-\u9fff]', word):
text += word
continue
for i in range(len(bopomofos)):
bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\', bopomofos[i])
if text != '':
text += ' '
text += ''.join(bopomofos)
return text
def latin_to_bopomofo(text):
for regex, replacement in _latin_to_bopomofo:
text = re.sub(regex, replacement, text)
return text
def bopomofo_to_romaji(text):
for regex, replacement in _bopomofo_to_romaji:
text = re.sub(regex, replacement, text)
return text
def bopomofo_to_ipa(text):
for regex, replacement in _bopomofo_to_ipa:
text = re.sub(regex, replacement, text)
return text
def bopomofo_to_ipa2(text):
for regex, replacement in _bopomofo_to_ipa2:
text = re.sub(regex, replacement, text)
return text
def chinese_to_romaji(text):
text = number_to_chinese(text)
text = chinese_to_bopomofo(text)
text = latin_to_bopomofo(text)
text = bopomofo_to_romaji(text)
text = re.sub('i([aoe])', r'y\1', text)
text = re.sub('u([aoəe])', r'w\1', text)
text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\\2', text)
return text
def chinese_to_lazy_ipa(text):
text = chinese_to_romaji(text)
for regex, replacement in _romaji_to_ipa:
text = re.sub(regex, replacement, text)
return text
def chinese_to_ipa(text):
text = number_to_chinese(text)
text = chinese_to_bopomofo(text)
text = latin_to_bopomofo(text)
text = bopomofo_to_ipa(text)
text = re.sub('i([aoe])', r'j\1', text)
text = re.sub('u([aoəe])', r'w\1', text)
text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\\2', text)
return text
def chinese_to_ipa2(text):
text = number_to_chinese(text)
text = chinese_to_bopomofo(text)
text = latin_to_bopomofo(text)
text = bopomofo_to_ipa2(text)
text = re.sub(r'i([aoe])', r'j\1', text)
text = re.sub(r'u([aoəe])', r'w\1', text)
text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\\2', text)
text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
return text