pun_emo_speaker_utils/takway/common_utils.py

97 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ############################################################# #
# format table function
# ############################################################# #
def format_table(header, rows):
    """Print ``header`` and ``rows`` as a left-aligned, fixed-width text table.

    Every column shares a single width: the length of the longest cell
    (header cells included) plus two characters of padding.

    Args:
        header: Iterable of column titles; each one is converted with ``str``.
        rows: Iterable of rows, each an iterable of cell values; each cell is
            converted with ``str``.

    Returns:
        None. The table is written to standard output with ``print``.
    """
    # Materialize both inputs once so generators are accepted and every cell
    # is already a string when it is measured and padded.
    header = [str(word) for word in header]
    rows = [[str(word) for word in row] for row in rows]

    # Header cells take part in the width calculation so long titles are not
    # run together; ``default=0`` lets an empty table print instead of
    # raising ValueError from ``max`` on an empty sequence.
    longest = max(
        (len(word) for line in (header, *rows) for word in line),
        default=0,
    )
    col_width = longest + 2

    # Header line
    print("".join(word.ljust(col_width) for word in header))
    # Separator line: one dashed segment per header column
    print("".join("-" * col_width for _ in header))
    # Data lines
    for row in rows:
        print("".join(word.ljust(col_width) for word in row))
# ############################################################# #
# encode and decode bytes and string
# ############################################################# #
import base64
def encode_bytes2str(data):
    """Encode a bytes object as Base64 text.

    Args:
        data: Bytes-like object to encode, or ``None``.

    Returns:
        The Base64 representation of ``data`` as a ``str``, or ``None`` when
        ``data`` is ``None``.
    """
    if data is not None:
        encoded = base64.b64encode(data)
        return encoded.decode('utf-8')
    return None
def decode_str2bytes(data):
    """Decode Base64 text back into the bytes it represents.

    Args:
        data: Base64 text as a ``str``, or ``None``.

    Returns:
        The decoded ``bytes``, or ``None`` when ``data`` is ``None``.
    """
    return None if data is None else base64.b64decode(data.encode('utf-8'))
import re
def split_sentences(text: str):
    """Split ``text`` into fragments at runs of punctuation marks.

    Consecutive delimiters count as one break, the delimiters themselves are
    dropped, and empty fragments are discarded.

    Args:
        text: The text to split.

    Returns:
        A list of the non-empty fragments, in their original order.
    """
    # NOTE(review): the previous character class was garbled and matched only
    # backslashes and U+300B, so ordinary punctuation never split anything.
    # The set below is reconstructed from the function's stated purpose;
    # confirm it against the intended delimiter list.
    # Code points are spelled as escapes so look-alike full-width characters
    # cannot be silently altered by editors or viewers.
    delimiters = (
        # ideographic full stop, full-width ! ? , ; :, ideographic comma
        "\u3002\uff01\uff1f\uff0c\uff1b\uff1a\u3001"
        # curly double quotes, full-width ( ), angle and lenticular brackets
        "\u201c\u201d\uff08\uff09\u300a\u300b\u3010\u3011"
        # ASCII counterparts
        "!?,;:()"
    )
    pattern = "[" + re.escape(delimiters) + "]+"
    return [piece for piece in re.split(pattern, text) if piece]
'''
# 示例文本
text = "今天天气真好,我们去公园玩吧!你觉得怎么样?好的,那就这么定了。"
# 调用函数进行断句
sentences = split_sentences(text)
print(sentences)
'''
# Punctuation marks that end a fragment. Code points are spelled as escapes
# so look-alike full-width characters cannot be silently altered.
_CHINESE_PUNCTUATIONS = frozenset(
    # ideographic full stop and comma, curly double quotes,
    # angle brackets, lenticular brackets
    "\u3002\u3001\u201c\u201d\u300a\u300b\u3010\u3011"
    # full-width ! ? , ; : ( )
    "\uff01\uff1f\uff0c\uff1b\uff1a\uff08\uff09"
    # ASCII ! ? , ; : ( )
    "!?,;:()"
)


def split_chinese_text(text: str, return_patch=False):
    """Split ``text`` after every punctuation mark, keeping the marks.

    Each fragment ends with the punctuation mark that closed it; any text
    after the last mark becomes a final, unterminated fragment.

    Args:
        text: The text to split.
        return_patch: Selects the return shape (see Returns).

    Returns:
        When ``return_patch`` is false: ``(sentences, punctuation_list)``,
        the fragments and the punctuation marks found, both in order.

        When ``return_patch`` is true: ``(parts, found)`` where ``found``
        tells whether any punctuation mark was seen.
          * no mark at all: ``([text], False)``;
          * text ends with a mark: ``([text], True)`` as a single part;
          * otherwise: ``([terminated_prefix, unterminated_tail], True)``.
    """
    sentences = []
    punctuation_list = []
    start = 0  # index where the current fragment begins
    for i, char in enumerate(text):
        if char in _CHINESE_PUNCTUATIONS:
            # Close the fragment right after the mark and remember the mark.
            sentences.append(text[start:i + 1])
            punctuation_list.append(char)
            start = i + 1
    # Trailing text that is not followed by any punctuation mark.
    if start < len(text):
        sentences.append(text[start:])

    if not return_patch:
        return sentences, punctuation_list
    if not punctuation_list:
        return [text], False  # nothing but unterminated text
    if len(sentences) == len(punctuation_list):
        # Every fragment is terminated, so there is no leftover tail.
        return [''.join(sentences)], True
    return [''.join(sentences[:-1]), sentences[-1]], True
'''
# 示例文本
text = "你好,世界!今天天气怎么样?希望你有一个美好的一天。"
sentences, punctuation_list = split_chinese_text(text)
print("断句结果:", sentences)
print("标点符号列表:", punctuation_list)
'''
def remove_brackets_and_contents(text):
    """Return ``text`` with bracketed spans removed, brackets included.

    Three bracket styles are stripped, one after another: ASCII parentheses,
    full-width parentheses (U+FF08/U+FF09) and lenticular brackets
    (U+3010/U+3011). Matching is non-greedy and does not cross line breaks.

    Args:
        text: The text to clean.

    Returns:
        The text with every matched bracket pair and its contents deleted.
    """
    # NOTE(review): the second and third patterns were garbled (unterminated
    # raw strings with missing bracket characters). They are reconstructed
    # here as full-width parentheses and lenticular brackets; confirm against
    # the intended bracket set. Escapes keep the code points unambiguous.
    result = re.sub(r'\(.*?\)', '', text)
    result = re.sub(r'\uff08.*?\uff09', '', result)
    result = re.sub(r'\u3010.*?\u3011', '', result)
    return result