# FunASR/funasr/tokenizer/korean_cleaner.py
#
# 75 lines
# 1.9 KiB
# Python

# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
import re
class KoreanCleaner:
    """Spell out ASCII digits and Latin letters in Hangul.

    All methods are classmethods; the class holds no instance state. Every
    character that is not an ASCII digit or an ASCII letter is passed through
    unchanged.
    """

    # Sino-Korean reading of each ASCII digit. Digits are read one at a time,
    # so "10" becomes "일영" rather than the positional reading "십".
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Hangul spelling of each English letter name, keyed by the upper-case
    # letter. Every key must map to a non-empty reading: an empty value would
    # silently delete that letter from the normalized text.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    @classmethod
    def _normalize_numbers(cls, text):
        """Return ``text`` with each ASCII digit replaced by its Korean reading.

        Args:
            text: Input string.

        Returns:
            A new string; characters other than ``0``-``9`` are kept as-is.
        """
        return "".join(cls._NUMBER_TO_KOR.get(char, char) for char in text)

    @classmethod
    def _normalize_english_text(cls, text):
        """Return ``text`` with each ASCII letter replaced by its Korean letter name.

        Letters are matched case-insensitively: ``a`` and ``A`` both become
        "에이".

        Args:
            text: Input string.

        Returns:
            A new string; characters other than ``A``-``Z`` / ``a``-``z`` are
            kept as-is.
        """
        # Upper-case only ASCII a-z runs. A plain ``text.upper()`` would also
        # rewrite non-ASCII characters (e.g. "ß" -> "SS"), which must pass
        # through untouched.
        new_text = re.sub(r"[a-z]+", lambda match: match.group().upper(), text)
        return "".join(
            cls._UPPER_ALPHABET_TO_KOR.get(char, char) for char in new_text
        )

    @classmethod
    def normalize_text(cls, text):
        """Strip ``text`` and spell out its digits and ASCII letters in Hangul.

        Args:
            text: Input string.

        Returns:
            The normalized string.
        """
        # stage 0 : strip leading/trailing whitespace
        text = text.strip()
        # stage 1 : digits -> Korean readings
        text = cls._normalize_numbers(text)
        # stage 2 : ASCII letters -> Korean letter names
        text = cls._normalize_english_text(text)
        return text