1
0
Fork 0
TakwayDisplayPlatform/utils/bert_vits2/re_matching.py

82 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into ``(LANGUAGE, text)`` segments.

    Args:
        speaker: Speaker tag including its enclosing delimiters
            (e.g. ``"[name]"``); the first and last characters are dropped.
        dialogue: Text made of ``<lang>text`` segments.

    Returns:
        A list with one ``(language_upper, stripped_text)`` tuple per
        ``<lang>text`` segment found, followed by the bare speaker name as
        the final element.
    """
    # A "<lang>" tag followed by everything up to the next "<".
    segments = [
        (found.group(1).upper(), found.group(2).strip())
        for found in re.finditer(r"<(\S+?)>([^<]+)", dialogue, re.DOTALL)
    ]
    # The speaker name (without its surrounding brackets) always goes last.
    segments.append(speaker[1:-1])
    return segments
def validate_text(input_text):
    """Check that ``input_text`` follows the ``[speaker]<lang>text`` layout.

    Args:
        input_text: The raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``False`` when no
        ``[speaker]`` tag followed by at least one ``<lang>text`` segment is
        found, or when a matched speaker yields no language segments.
    """
    # A "[speaker]" tag followed by one or more "<lang>text" segments.
    # re.DOTALL is kept from the original call; the pattern contains no ".",
    # so the flag does not change what is matched.
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like a speaker section at all.
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        # The helper always appends the speaker name as the last element, so
        # the emptiness check must look only at the language segments before
        # it; testing the whole list could never fail.
        *segments, _speaker_name = extract_language_and_text_updated(
            speaker, dialogue
        )
        if not segments:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
def text_matching(text: str) -> list:
    """Parse ``text`` into one entry per ``[speaker]`` section.

    Each section runs from a ``[speaker]`` tag up to the next such tag or the
    end of the text, and is handed to ``extract_language_and_text_updated``.

    Args:
        text: Input containing ``[speaker]`` tags followed by dialogue.

    Returns:
        A list holding the helper's result for every section, in order of
        appearance.
    """
    # Lazy ".+?" with a lookahead stops each section right before the next
    # "[speaker]" tag (or at the end); DOTALL lets a section span newlines.
    section_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
    return [
        extract_language_and_text_updated(tag, body)
        for tag, body in re.findall(section_pattern, text, re.DOTALL)
    ]
def cut_para(text):
    """Split ``text`` into paragraphs on newline characters.

    Args:
        text: The text to split.

    Returns:
        A list of the non-empty lines of ``text``, each stripped of leading
        and trailing whitespace.
    """
    # Strip every line once, then keep only those with content left.
    trimmed_lines = (line.strip() for line in text.split("\n"))
    return [line for line in trimmed_lines if line]
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    A break is inserted after a sentence terminator (or an ellipsis) unless
    it is immediately followed by a closing curly quote; in that case the
    break goes after the quote instead. ASCII and full-width forms of
    ``!``, ``?`` and ``;`` are all treated as terminators.

    Args:
        para: The paragraph text to split.

    Returns:
        A list of sentence strings. Trailing whitespace of ``para`` is
        removed before splitting, and an empty input yields ``[""]``.
    """
    # Single-character terminators, not followed by a closing quote.
    para = re.sub(r"([。！!；;？?])([^”’])", r"\1\n\2", para)
    # English ellipsis: six consecutive dots.
    para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para)
    # Chinese ellipsis: two U+2026 characters. Written as an escape so the
    # character cannot be silently lost (the previous pattern had degraded
    # to matching the literal text "{2}").
    para = re.sub(r"(\u2026{2})([^”’])", r"\1\n\2", para)
    # Terminator + closing quote: break after the quote, unless more
    # punctuation follows directly.
    para = re.sub(r"([。！!？?][”’])([^，,。！!？?])", r"\1\n\2", para)
    # Drop any trailing newline/whitespace left at the end of the paragraph.
    para = para.rstrip()
    return para.split("\n")
if __name__ == "__main__":
    # Smoke run 1: a speaker with no dialogue followed by multi-language
    # speakers. The parsed result is not inspected.
    text = """
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
"""
    text_matching(text)

    # Smoke run 2: parse a second sample, then print its validation result.
    test_text = """
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
"""
    text_matching(test_text)
    print(validate_text(test_text))