1
0
Fork 0
TakwayDisplayPlatform/utils/bert_vits2/text/japanese.py

721 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Convert Japanese text to phonemes that are
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from ..text import punctuation, symbols
from num2words import num2words
import pyopenjtalk
import jaconv
# Mapping of hiragana to phonetic representation.
#
# Keys are hiragana sequences of one to three characters; values are the
# phoneme string for that sequence, each phoneme preceded by a single space.
# ":" marks a long vowel. hiragana2p() looks keys up longest-first.
#
# NOTE(review): the single-kana keys had been reduced to empty strings (so
# they all collapsed into one "" entry); they are restored here from their
# phoneme values and the surrounding comments. The trailing space in the
# value for "どぁ" was also removed, because it produced an empty phoneme
# once the result was split on spaces.
hiragana_map = {
    "う゛ぁ": " v a",
    "う゛ぃ": " v i",
    "う゛ぇ": " v e",
    "う゛ぉ": " v o",
    "う゛ゅ": " by u",
    "ぅ゛": " v u",
    # Added handling for ゔ and related kana
    "ゔぁ": " v a",
    "ゔぃ": " v i",
    "ゔぇ": " v e",
    "ゔぉ": " v o",
    "ゔゅ": " by u",
    # Two-character conversion rules
    "あぁ": " a a",
    "いぃ": " i i",
    "いぇ": " i e",
    "いゃ": " y a",
    "うぅ": " u:",
    "えぇ": " e e",
    "おぉ": " o:",
    "かぁ": " k a:",
    "きぃ": " k i:",
    "くぅ": " k u:",
    "くゃ": " ky a",
    "くゅ": " ky u",
    "くょ": " ky o",
    "けぇ": " k e:",
    "こぉ": " k o:",
    "がぁ": " g a:",
    "ぎぃ": " g i:",
    "ぐぅ": " g u:",
    "ぐゃ": " gy a",
    "ぐゅ": " gy u",
    "ぐょ": " gy o",
    "げぇ": " g e:",
    "ごぉ": " g o:",
    "さぁ": " s a:",
    "しぃ": " sh i",
    "すぅ": " s u:",
    "すゃ": " sh a",
    "すゅ": " sh u",
    "すょ": " sh o",
    "せぇ": " s e:",
    "そぉ": " s o:",
    "ざぁ": " z a:",
    "じぃ": " j i:",
    "ずぅ": " z u:",
    "ずゃ": " zy a",
    "ずゅ": " zy u",
    "ずょ": " zy o",
    "ぜぇ": " z e:",
    "ぞぉ": " z o:",
    "たぁ": " t a:",
    "ちぃ": " ch i",
    "つぁ": " ts a",
    "つぃ": " ts i",
    "つぅ": " ts u",
    "つゃ": " ch a",
    "つゅ": " ch u",
    "つょ": " ch o",
    "つぇ": " ts e",
    "つぉ": " ts o",
    "てぇ": " t e:",
    "とぉ": " t o:",
    "だぁ": " d a:",
    "ぢぃ": " j i:",
    "づぅ": " d u:",
    "づゃ": " zy a",
    "づゅ": " zy u",
    "づょ": " zy o",
    "でぇ": " d e:",
    "なぁ": " n a:",
    "にぃ": " n i:",
    "ぬぅ": " n u:",
    "ぬゃ": " ny a",
    "ぬゅ": " ny u",
    "ぬょ": " ny o",
    "ねぇ": " n e:",
    "のぉ": " n o:",
    "はぁ": " h a:",
    "ひぃ": " h i:",
    "ふぅ": " f u:",
    "ふゃ": " hy a",
    "へぇ": " h e:",
    "ほぉ": " h o:",
    "ばぁ": " b a:",
    "びぃ": " b i:",
    "ぶぅ": " b u:",
    "ぶゅ": " by u",
    "べぇ": " b e:",
    "ぼぉ": " b o:",
    "ぱぁ": " p a:",
    "ぴぃ": " p i:",
    "ぷぅ": " p u:",
    "ぷゃ": " py a",
    "ぷゅ": " py u",
    "ぷょ": " py o",
    "ぺぇ": " p e:",
    "ぽぉ": " p o:",
    "まぁ": " m a:",
    "みぃ": " m i:",
    "むぅ": " m u:",
    "むゃ": " my a",
    "むゅ": " my u",
    "むょ": " my o",
    "めぇ": " m e:",
    "もぉ": " m o:",
    "やぁ": " y a:",
    "ゆぅ": " y u:",
    "ゆゃ": " y a:",
    "ゆゅ": " y u:",
    "ゆょ": " y o:",
    "よぉ": " y o:",
    "らぁ": " r a:",
    "りぃ": " r i:",
    "るぅ": " r u:",
    "るゃ": " ry a",
    "るゅ": " ry u",
    "るょ": " ry o",
    "れぇ": " r e:",
    "ろぉ": " r o:",
    "わぁ": " w a:",
    "をぉ": " o:",
    "う゛": " b u",
    "でぃ": " d i",
    "でゃ": " dy a",
    "でゅ": " dy u",
    "でょ": " dy o",
    "てぃ": " t i",
    "てゃ": " ty a",
    "てゅ": " ty u",
    "てょ": " ty o",
    "すぃ": " s i",
    "ずぁ": " z u",
    "ずぃ": " z i",
    "ずぇ": " z e",
    "ずぉ": " z o",
    "きゃ": " ky a",
    "きゅ": " ky u",
    "きょ": " ky o",
    "しゃ": " sh a",
    "しゅ": " sh u",
    "しぇ": " sh e",
    "しょ": " sh o",
    "ちゃ": " ch a",
    "ちゅ": " ch u",
    "ちぇ": " ch e",
    "ちょ": " ch o",
    "とぅ": " t u",
    "とゃ": " ty a",
    "とゅ": " ty u",
    "とょ": " ty o",
    "どぁ": " d o",
    "どぅ": " d u",
    "どゃ": " dy a",
    "どゅ": " dy u",
    "どょ": " dy o",
    "どぉ": " d o:",
    "にゃ": " ny a",
    "にゅ": " ny u",
    "にょ": " ny o",
    "ひゃ": " hy a",
    "ひゅ": " hy u",
    "ひょ": " hy o",
    "みゃ": " my a",
    "みゅ": " my u",
    "みょ": " my o",
    "りゃ": " ry a",
    "りゅ": " ry u",
    "りょ": " ry o",
    "ぎゃ": " gy a",
    "ぎゅ": " gy u",
    "ぎょ": " gy o",
    "ぢぇ": " j e",
    "ぢゃ": " j a",
    "ぢゅ": " j u",
    "ぢょ": " j o",
    "じぇ": " j e",
    "じゃ": " j a",
    "じゅ": " j u",
    "じょ": " j o",
    "びゃ": " by a",
    "びゅ": " by u",
    "びょ": " by o",
    "ぴゃ": " py a",
    "ぴゅ": " py u",
    "ぴょ": " py o",
    "うぁ": " u a",
    "うぃ": " w i",
    "うぇ": " w e",
    "うぉ": " w o",
    "ふぁ": " f a",
    "ふぃ": " f i",
    "ふゅ": " hy u",
    "ふょ": " hy o",
    "ふぇ": " f e",
    "ふぉ": " f o",
    # Single-kana conversion rules
    "あ": " a",
    "い": " i",
    "う": " u",
    "ゔ": " v u",  # added handling for ゔ
    "え": " e",
    "お": " o",
    "か": " k a",
    "き": " k i",
    "く": " k u",
    "け": " k e",
    "こ": " k o",
    "さ": " s a",
    "し": " sh i",
    "す": " s u",
    "せ": " s e",
    "そ": " s o",
    "た": " t a",
    "ち": " ch i",
    "つ": " ts u",
    "て": " t e",
    "と": " t o",
    "な": " n a",
    "に": " n i",
    "ぬ": " n u",
    "ね": " n e",
    "の": " n o",
    "は": " h a",
    "ひ": " h i",
    "ふ": " f u",
    "へ": " h e",
    "ほ": " h o",
    "ま": " m a",
    "み": " m i",
    "む": " m u",
    "め": " m e",
    "も": " m o",
    "ら": " r a",
    "り": " r i",
    "る": " r u",
    "れ": " r e",
    "ろ": " r o",
    "が": " g a",
    "ぎ": " g i",
    "ぐ": " g u",
    "げ": " g e",
    "ご": " g o",
    "ざ": " z a",
    "じ": " j i",
    "ず": " z u",
    "ぜ": " z e",
    "ぞ": " z o",
    "だ": " d a",
    "ぢ": " j i",
    "づ": " z u",
    "で": " d e",
    "ど": " d o",
    "ば": " b a",
    "び": " b i",
    "ぶ": " b u",
    "べ": " b e",
    "ぼ": " b o",
    "ぱ": " p a",
    "ぴ": " p i",
    "ぷ": " p u",
    "ぺ": " p e",
    "ぽ": " p o",
    "や": " y a",
    "ゆ": " y u",
    "よ": " y o",
    "わ": " w a",
    "ゐ": " i",
    "ゑ": " e",
    "ん": " N",
    "っ": " q",
    # Small ぁぃぅぇぉ not consumed by an earlier rule are treated like
    # their full-size counterparts
    "ぁ": " a",
    "ぃ": " i",
    "ぅ": " u",
    "ぇ": " e",
    "ぉ": " o",
    "ゎ": " w a",
    # Long-vowel handling. The corresponding jaconv steps below are
    # intentionally not applied:
    # for (pattern, replace_str) in JULIUS_LONG_VOWEL:
    # text = pattern.sub(replace_str, text)
    # text = text.replace("o u", "o:") # sound change: おう -> おー
    "ー": ":",
    "〜": ":",
    "−": ":",
    "-": ":",
    # Other special cases
    "を": " o",
    # Bare small ゃゅょ not consumed by an earlier rule are also treated as
    # full-size kana (added)
    "ゃ": " y a",
    "ゅ": " y u",
    "ょ": " y o",
}
def hiragana2p(txt: str) -> str:
    """
    Turn a hiragana string into a space-separated phoneme string.

    Modification of `jaconv.hiragana2julius`:
    - no `:` in the output; a long vowel is written by repeating the sound,
      e.g. `あーーー` -> `a a a a`.
    - `o u` is not rewritten to `o o` (the input is already the actual `yomi`).
    - `N` is not used for `ん`; it is emitted as `n` (for compatibility).
    - `v` is used for `ゔ`-related text.
    - bare `ゃ` `ゅ` `ょ` become `y a` `y u` `y o` (for compatibility).

    The input is scanned left to right, trying the longest slice first
    (3, then 2, then 1 characters) against `hiragana_map`. A character that
    starts no known slice is dropped from the output.
    """
    pieces = []
    pos = 0
    total = len(txt)
    while pos < total:
        for width in (3, 2, 1):
            chunk = txt[pos : pos + width]
            if chunk in hiragana_map:
                pieces.append(hiragana_map[chunk])
                pos += width
                break
        else:
            # Nothing in the table starts here; move past this character.
            pos += 1

    phonemes = "".join(pieces).strip()
    # Plain substring replacement (not a regular expression).
    phonemes = phonemes.replace(":+", ":")

    # Up to here this matches `jaconv.hiragana2julius` apart from the sound
    # change and long-vowel steps. From here each `:` is spelled out by
    # repeating the preceding character: `k a:: k i:` -> `k a a a k i i`.
    def _spell_out(match):
        sound = match.group(1)
        return sound + (" " + sound) * len(match.group(2))

    phonemes = re.sub(r"(\w)(:*)", _spell_out, phonemes)
    # Emit lowercase `n` wherever the table produced `N`.
    return phonemes.replace("N", "n")
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phonemes.

    Args:
        text: Katakana reading of one word, or a punctuation mark.

    Returns:
        A list of strings. A long-vowel mark "ー" at the start of the text
        is returned unchanged as its own element (the caller resolves it
        against the preceding word); text matched by `_MARKS` is passed
        through; everything else is converted with `hiragana2p` after
        `jaconv.kata2hira`.

    NOTE(review): the "ー" literals in this function had been reduced to
    empty strings, which made `startswith` true for every input; they are
    restored here.
    """
    text = text.strip()
    if text == "ー":
        return ["ー"]
    elif text.startswith("ー"):
        return ["ー"] + kata2phoneme(text[1:])
    res = []
    while text:
        if re.match(_MARKS, text):
            # NOTE(review): appends the whole remaining text, then advances
            # by one character; equivalent to appending the mark only when
            # the text is a single character - confirm that is intended.
            res.append(text)
            text = text[1:]
            continue
        if text.startswith("ー"):
            # There is no previous phoneme tracked here to lengthen, so the
            # mark is dropped (same effect as the original, whose `prev`
            # was never assigned).
            text = text[1:]
            continue
        # The rest of the text is converted in one go.
        res += hiragana2p(jaconv.kata2hira(text)).split(" ")
        break
    return res
_SYMBOL_TOKENS = set(list("・、。?!"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
_MARKS = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
def text2sep_kata(text: str):
    """Split text into words and their katakana readings.

    Runs `pyopenjtalk.run_frontend` on `text` and walks the parsed items,
    reading each item's "string" (surface form) and "pron" (reading).

    Args:
        text: Text to analyse.

    Returns:
        A tuple `(sep, res, accents)`:
        - `sep`: list of surface words (punctuation already mapped through
          `replace_punctuation`),
        - `res`: list of readings aligned with `sep`; tokens listed in
          `_NO_YOMI_TOKENS` that have no reading contribute nothing here,
        - `accents`: the result of `get_accent(parsed)`.

    NOTE(review): three string literals in this function had been reduced
    to empty strings ("’" removed from the reading, and the "っ"/"ッ"
    special case); they are restored here. Nesting was reconstructed from
    the statement order - confirm the placement of `yomi = word`.
    """
    parsed = pyopenjtalk.run_frontend(text)
    res = []
    sep = []
    for parts in parsed:
        word = replace_punctuation(parts["string"])
        # Strip the "’" (U+2019) marker from the reading.
        yomi = parts["pron"].replace("’", "")
        if yomi:
            if re.match(_MARKS, yomi):
                # The "reading" is itself a mark, i.e. the token is
                # punctuation or something the front end could not read.
                if len(word) > 1:
                    # Several marks in one token: emit them one by one.
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    sep += word
                    continue
                elif word not in rep_map and word not in rep_map.values():
                    # Unknown single mark: fall back to a comma.
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                # A lone sokuon has no reading of its own; use katakana ッ.
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
        sep.append(word)
    return sep, res, get_accent(parsed)
def get_accent(parsed):
    """Pair every non-silent phoneme with an accent marker.

    Labels are produced by `pyopenjtalk.make_label(parsed)`. For each label
    the phoneme is the text between `-` and `+`; labels whose phoneme is
    `sil` or `pau` are skipped. Two numbers are read from the label (the
    signed value after `/A:` and the next `+N+` field) and compared with the
    second number of the following label.

    Returns:
        A list of `(phoneme, accent)` tuples where accent is -1 (falling),
        1 (rising) or 0. `cl` inside a phoneme is replaced by `q` and the
        phoneme is lowercased.
    """
    full_labels = pyopenjtalk.make_label(parsed)
    silent = ("sil", "pau")
    pairs = []
    for pos, current in enumerate(full_labels):
        sound = re.search(r"\-([^\+]*)\+", current).group(1)
        if sound in silent:
            continue

        first = int(re.search(r"/A:(\-?[0-9]+)\+", current).group(1))
        second = int(re.search(r"\+(\d+)\+", current).group(1))

        # Look one label ahead; a following silence counts as -1.
        following = full_labels[pos + 1]
        if re.search(r"\-([^\+]*)\+", following).group(1) in silent:
            second_next = -1
        else:
            second_next = int(re.search(r"\+(\d+)\+", following).group(1))

        if first == 0 and second_next == second + 1:
            marker = -1  # Falling
        elif second == 1 and second_next == 2:
            marker = 1  # Rising
        else:
            marker = 0
        pairs.append((sound.replace("cl", "q").lower(), marker))
    return pairs
# Katakana (or kanji) readings for single characters: ASCII symbols,
# lowercase Latin letters and lowercase Greek letters. Used by
# japanese_convert_alpha_symbols_to_words(), which lowercases its input
# before looking characters up here.
_ALPHASYMBOL_YOMI = {
"#": "シャープ",
"%": "パーセント",
"&": "アンド",
"+": "プラス",
"-": "マイナス",
":": "コロン",
";": "セミコロン",
"<": "小なり",
"=": "イコール",
">": "大なり",
"@": "アット",
"a": "エー",
"b": "ビー",
"c": "シー",
"d": "ディー",
"e": "イー",
"f": "エフ",
"g": "ジー",
"h": "エイチ",
"i": "アイ",
"j": "ジェー",
"k": "ケー",
"l": "エル",
"m": "エム",
"n": "エヌ",
"o": "オー",
"p": "ピー",
"q": "キュー",
"r": "アール",
"s": "エス",
"t": "ティー",
"u": "ユー",
"v": "ブイ",
"w": "ダブリュー",
"x": "エックス",
"y": "ワイ",
"z": "ゼット",
"α": "アルファ",
"β": "ベータ",
"γ": "ガンマ",
"δ": "デルタ",
"ε": "イプシロン",
"ζ": "ゼータ",
"η": "イータ",
"θ": "シータ",
"ι": "イオタ",
"κ": "カッパ",
"λ": "ラムダ",
"μ": "ミュー",
"ν": "ニュー",
"ξ": "クサイ",
"ο": "オミクロン",
"π": "パイ",
"ρ": "ロー",
"σ": "シグマ",
"τ": "タウ",
"υ": "ウプシロン",
"φ": "ファイ",
"χ": "カイ",
"ψ": "プサイ",
"ω": "オメガ",
}
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "", "£": "ポンド", "": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
    """Spell out the numbers in `text` as Japanese words.

    Three passes, in order: thousands separators are removed from grouped
    numbers; a currency sign in front of an amount is moved behind it as
    its `_CURRENCY_MAP` word (the sign itself when it has no entry); every
    remaining number is converted with `num2words(..., lang="ja")`.
    """

    def _drop_separators(match):
        return match[0].replace(",", "")

    def _unit_after_amount(match):
        sign = match[1]
        return match[2] + _CURRENCY_MAP.get(sign, sign)

    def _spell_number(match):
        return num2words(match[0], lang="ja")

    ungrouped = _NUMBER_WITH_SEPARATOR_RX.sub(_drop_separators, text)
    with_units = _CURRENCY_RX.sub(_unit_after_amount, ungrouped)
    return _NUMBER_RX.sub(_spell_number, with_units)
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    """Replace letters and symbols by their readings from `_ALPHASYMBOL_YOMI`.

    The text is lowercased first; characters without an entry are kept
    (in their lowercased form).
    """
    spoken = []
    for ch in text.lower():
        spoken.append(_ALPHASYMBOL_YOMI.get(ch, ch))
    return "".join(spoken)
def is_japanese_character(char):
    """Return True when `char` lies in one of the Japanese script ranges.

    `char` must be a single character (it is passed to `ord`).
    """
    # Unicode ranges of the Japanese writing system (inclusive bounds).
    # Further ideograph extension ranges can be added here if needed.
    japanese_ranges = (
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # Kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # Kanji extension A
        (0x20000, 0x2A6DF),  # Kanji extension B
    )
    # Code point of the character as an integer.
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in japanese_ranges)
rep_map = {
"": ",",
"": ",",
"": ",",
"": ".",
"": "!",
"": "?",
"\n": ".",
"": ".",
"": "...",
"···": "...",
"・・・": "...",
"·": ",",
"": ",",
"": ",",
"$": ".",
"": "'",
"": "'",
'"': "'",
"": "'",
"": "'",
"": "'",
"": "'",
"(": "'",
")": "'",
"": "'",
"": "'",
"": "'",
"": "'",
"[": "'",
"]": "'",
"": "-",
"": "-",
"": "-",
"~": "-",
"": "'",
"": "'",
}
def replace_punctuation(text):
    """Normalise punctuation and drop every other non-Japanese character.

    First every `rep_map` key found in `text` is replaced by its value.
    Then every run of characters that is neither kana (U+3040-U+30FF), a CJK
    ideograph (U+4E00-U+9FFF, U+3400-U+4DBF), the iteration mark U+3005, nor
    one of the symbols in `punctuation` is removed.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # Longest keys first so a multi-character key is never shadowed by a
    # shorter one; the sort is stable, so equal-length keys keep the order
    # they have in `rep_map`.
    ordered_keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(p) for p in ordered_keys))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    # Escape each punctuation symbol: they are placed inside a character
    # class, where an unescaped "-", "]" or "^" would change its meaning.
    allowed_punctuation = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        + allowed_punctuation
        + r"]+",
        "",
        replaced_text,
    )
    return replaced_text
def text_normalize(text):
    """Normalise raw text before grapheme-to-phoneme conversion.

    Steps, in order: Unicode NFKC normalisation, numbers spelled out with
    `japanese_convert_numbers_to_words`, punctuation cleaned with
    `replace_punctuation`, and stray combining voiced sound marks removed.

    Args:
        text: Raw input text.

    Returns:
        The normalised text.
    """
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    # res = "".join([i for i in res if is_japanese_character(i)])
    res = replace_punctuation(res)
    # NOTE(review): the character removed here had been reduced to an empty
    # string, making the call a no-op. Restored as the combining voiced
    # sound mark U+3099, which lies in the kana range that
    # replace_punctuation keeps - confirm against the upstream source.
    res = res.replace("\u3099", "")
    return res
def distribute_phone(n_phone, n_word):
    """Spread `n_phone` phonemes over `n_word` slots as evenly as possible.

    Phonemes are handed out one at a time, each to the slot that currently
    holds the fewest (the leftmost one on a tie).

    Returns:
        A list of length `n_word` with the number of phonemes per slot.
    """
    counts = [0] * n_word
    for _ in range(n_phone):
        # min() returns the first index among equally small counts.
        lightest = min(range(n_word), key=counts.__getitem__)
        counts[lightest] += 1
    return counts
def handle_long(sep_phonemes):
    """Resolve long-vowel marks ("ー") left in the per-word phoneme lists.

    A mark at the start of a word is replaced by the last phoneme of the
    previous word; a mark inside a word is replaced by the last character of
    the phoneme right before it. The lists are modified in place.

    Args:
        sep_phonemes: List of phoneme lists, one per word.

    Returns:
        The same `sep_phonemes` object.

    NOTE(review): the "ー" literals had been reduced to empty strings and are
    restored here. For the very first word, index `i - 1` wraps around to
    the last word - confirm whether a leading mark can occur there.
    """
    for i, phonemes in enumerate(sep_phonemes):
        if not phonemes:
            # Nothing to resolve; also avoids indexing into an empty list.
            continue
        if phonemes[0] == "ー":
            phonemes[0] = sep_phonemes[i - 1][-1]
        for j in range(len(phonemes)):
            if phonemes[j] == "ー":
                phonemes[j] = phonemes[j - 1][-1]
    return sep_phonemes
# Module-level tokenizer used by g2p() to split each word into tokens.
# Loaded when this module is imported, from a path relative to the current
# working directory.
tokenizer = AutoTokenizer.from_pretrained("./utils/bert_vits2/bert/deberta-v2-large-japanese-char-wwm")
def align_tones(phones, tones):
    """Align accent markers with the per-word phoneme lists.

    `tones` is consumed front to back: whenever the current phoneme equals
    the symbol of the next unused `(symbol, accent)` pair, that accent is
    added to the running value carried over from the previous phoneme of the
    same word, and the pair is used up. Each word's values are then shifted
    right by one position (a leading 0 is added, the last value dropped) and
    raised by one if they contain -1.

    Unlike the earlier implementation, `tones` is read through a cursor and
    is not modified, and no O(n) `pop(0)` is performed per phoneme.

    Args:
        phones: List of phoneme lists, one per word.
        tones: Sequence of `(symbol, accent)` pairs.

    Returns:
        A flat list with one value (0 or 1) per phoneme.

    Raises:
        AssertionError: If any resulting value is outside 0..1.
    """
    res = []
    cursor = 0  # index of the next unused entry of `tones`
    n_tones = len(tones)
    for pho in phones:
        temp = [0] * len(pho)
        for idx, p in enumerate(pho):
            if cursor >= n_tones:
                break
            if p == tones[cursor][0]:
                temp[idx] = tones[cursor][1]
                if idx > 0:
                    # Accumulate within the word.
                    temp[idx] += temp[idx - 1]
                cursor += 1
        # Shift right by one position.
        temp = ([0] + temp)[:-1]
        if -1 in temp:
            temp = [i + 1 for i in temp]
        res.append(temp)
    res = [i for j in res for i in j]
    assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
    return res
def rearrange_tones(tones, phones):
    """Re-label a tone sequence by comparing each tone with the one before.

    Labels: 1 when the tone equals the previous one (0 instead if the
    phoneme at that position is punctuation), 2 when it is higher, and when
    it is lower the previous position becomes 3 and this one 1.

    Returns:
        A list of labels with the same length as `tones`.
    """
    labels = [0] * len(tones)
    for pos, tone in enumerate(tones):
        if pos == 0:
            # NOTE(review): this tests the tone, not the phoneme, against
            # `punctuation`, unlike the equal-tone branch below - confirm.
            if tone not in punctuation:
                labels[pos] = 1
            continue
        before = tones[pos - 1]
        if tone == before:
            labels[pos] = 0 if phones[pos] in punctuation else 1
        elif tone > before:
            labels[pos] = 2
        elif tone < before:
            labels[pos - 1] = 3
            labels[pos] = 1
    return labels
def g2p(norm_text):
    """Convert normalised text into phonemes, tones and a word-to-phoneme map.

    Args:
        norm_text: Text to convert; the `__main__` demo in this file passes
            the output of `text_normalize`.

    Returns:
        A tuple `(phones, tones, word2ph)`:
        - `phones`: flat phoneme list wrapped in "_" at both ends,
        - `tones`: one value per entry of `phones` (0 at both ends),
        - `word2ph`: number of phonemes assigned to each token, wrapped in 1
          at both ends.

    Raises:
        AssertionError: If a phoneme is not in `symbols`, or if `phones` and
            `tones` end up with different lengths.
    """
    words, readings, accents = text2sep_kata(norm_text)

    # Punctuation stays a single token; everything else goes through the
    # tokenizer.
    tokens_per_word = [
        [w] if w in punctuation else tokenizer.tokenize(w) for w in words
    ]

    phonemes_per_word = handle_long([kata2phoneme(r) for r in readings])
    # Error handling: a word MeCab does not know gets passed all the way down
    # to here and blows up. So far only extremely rare, obscure words seem to
    # trigger this.
    for group in phonemes_per_word:
        for ph in group:
            assert ph in symbols, (words, readings, phonemes_per_word)
    tones = align_tones(phonemes_per_word, accents)

    word2ph = []
    for toks, group in zip(tokens_per_word, phonemes_per_word):
        word2ph.extend(distribute_phone(len(group), len(toks)))

    flat = [ph for group in phonemes_per_word for ph in group]
    phones = ["_"] + flat + ["_"]
    # tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones)
    return phones, tones, word2ph
if __name__ == "__main__":
    # Manual smoke test: normalise a sample sentence, convert it and print
    # the phonemes together with the shape of the BERT features.
    # The module-level tokenizer is rebound to a different checkpoint here.
    tokenizer = AutoTokenizer.from_pretrained("./utils/bert_vits2/bert/deberta-v2-large-japanese")
    sample = "hello,こんにちは、世界ー!……"
    from text.japanese_bert import get_bert_feature

    normalized = text_normalize(sample)
    print(normalized)
    phones, tones, word2ph = g2p(normalized)
    features = get_bert_feature(normalized, word2ph)
    print(phones, tones, word2ph, features.shape)