# Source: pun_emo_speaker_utils/takway/common_utils.py
# (97 lines, 3.2 KiB, Python; last modified 2024-05-11 22:34:08 +08:00)
# ############################################################# #
# format table function
# ############################################################# #
def format_table(header, rows):
    """Print a plain-text table with left-aligned, equal-width columns.

    Every column shares one width: the length of the longest cell (header
    cells included) plus two spaces of padding. The output is the header
    line, a line of dashes, then one line per row.

    Args:
        header: Iterable of column titles; each is converted with ``str()``.
        rows: Iterable of rows, each an iterable of cells; each cell is
            converted with ``str()``. May be empty.

    Returns:
        None. The table is written to standard output with ``print``.
    """
    # Materialise everything as strings once, so widths and output agree and
    # one-shot iterables are only consumed a single time.
    header_cells = [str(cell) for cell in header]
    body = [[str(cell) for cell in row] for row in rows]

    # The header takes part in the width computation so long titles do not
    # run into the next column; ``default=0`` keeps an empty table from raising.
    longest = max(
        (len(cell) for line in (header_cells, *body) for cell in line),
        default=0,
    )
    col_width = longest + 2

    print("".join(cell.ljust(col_width) for cell in header_cells))
    print("".join("-" * col_width for _ in header_cells))
    for row in body:
        print("".join(cell.ljust(col_width) for cell in row))
# ############################################################# #
# encode and decode bytes and string
# ############################################################# #
import base64
def encode_bytes2str(data):
    """Return the Base64 text form of *data*.

    Args:
        data: A bytes-like object, or ``None``.

    Returns:
        The Base64 encoding of *data* as a ``str``, or ``None`` when *data*
        is ``None``.
    """
    if data is not None:
        encoded = base64.b64encode(data)
        return encoded.decode('utf-8')
    # None is passed through unchanged.
    return None
def decode_str2bytes(data):
    """Decode a Base64 string back into the bytes it represents.

    Args:
        data: A ``str`` holding Base64 text, or ``None``.

    Returns:
        The decoded ``bytes``, or ``None`` when *data* is ``None``.
    """
    if data is not None:
        raw = data.encode('utf-8')
        return base64.b64decode(raw)
    # None is passed through unchanged.
    return None
import re
def split_sentences(text: str):
    """Split *text* into fragments at runs of Chinese punctuation.

    The punctuation itself is discarded, consecutive punctuation marks count
    as a single delimiter, and empty fragments are dropped.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty fragments, in their original order.
    """
    # Delimiter class, spelled with explicit code points so it cannot be
    # silently corrupted by tools that mangle escaped non-ASCII characters:
    #   U+3002 ideographic full stop     U+FF0C fullwidth comma
    #   U+3001 ideographic comma         U+FF1B fullwidth semicolon
    #   U+FF1A fullwidth colon           U+FF1F fullwidth question mark
    #   U+FF01 fullwidth exclamation     U+201C / U+201D curly double quotes
    #   U+FF08 / U+FF09 fullwidth parentheses
    #   U+300A / U+300B double angle brackets
    pattern = (
        r'[\u3002\uff0c\u3001\uff1b\uff1a\uff1f\uff01'
        r'\u201c\u201d\uff08\uff09\u300a\u300b]+'
    )
    # re.split yields empty strings at leading/trailing delimiters; drop them.
    return [fragment for fragment in re.split(pattern, text) if fragment]
'''
# 示例文本
text = "今天天气真好,我们去公园玩吧!你觉得怎么样?好的,那就这么定了。"
# 调用函数进行断句
sentences = split_sentences(text)
print(sentences)
'''
def split_chinese_text(text: str, return_patch=False):
    """Cut *text* after every Chinese punctuation mark.

    Each piece keeps its trailing punctuation mark. Text left over after the
    last mark becomes a final piece without punctuation.

    Args:
        text: The string to cut.
        return_patch: Selects the shape of the return value (see below).

    Returns:
        With ``return_patch`` false: ``(pieces, marks)``, where ``pieces`` is
        the list of cut strings and ``marks`` lists the punctuation
        characters found, in order.

        With ``return_patch`` true: ``(parts, found)``:
          * no punctuation in *text*: ``([text], False)``;
          * *text* ends with punctuation: ``([text], True)``;
          * otherwise: ``([punctuated_prefix, unpunctuated_tail], True)``.
    """
    delimiters = set('。!?,;:、“”()《》【】')
    pieces = []
    marks = []
    begin = 0  # index where the piece currently being scanned starts

    for idx, ch in enumerate(text):
        if ch not in delimiters:
            continue
        # Close the current piece just after the punctuation mark.
        end = idx + 1
        pieces.append(text[begin:end])
        marks.append(ch)
        begin = end

    # Whatever follows the last mark is an unterminated trailing piece.
    tail = text[begin:]
    if tail:
        pieces.append(tail)

    if not return_patch:
        return pieces, marks

    if not marks:
        # No punctuation at all: the whole text is still pending.
        return [text], False
    if len(pieces) == len(marks):
        # Every piece ends with punctuation, so nothing is left over.
        return [''.join(pieces)], True
    # Punctuated prefix first, unterminated remainder second.
    return [''.join(pieces[:-1]), pieces[-1]], True
'''
# 示例文本
text = "你好,世界!今天天气怎么样?希望你有一个美好的一天。"
sentences, punctuation_list = split_chinese_text(text)
print("断句结果:", sentences)
print("标点符号列表:", punctuation_list)
'''
def remove_brackets_and_contents(text):
    """Strip bracketed spans, including the brackets, from *text*.

    Three bracket styles are removed, one pass each and in this order:
    ASCII parentheses ``( )``, fullwidth parentheses (U+FF08 / U+FF09), and
    lenticular brackets (U+3010 / U+3011). Matching is non-greedy, so each
    opening bracket pairs with the nearest following closing bracket of the
    same style. ``.`` does not match a newline, so a span that crosses a line
    break is left in place.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every matched span removed.
    """
    # Non-ASCII brackets are written as \u escapes so the patterns stay
    # intact even if the file passes through tools that mangle such characters.
    result = re.sub(r'\(.*?\)', '', text)
    result = re.sub(r'\uff08.*?\uff09', '', result)
    result = re.sub(r'\u3010.*?\u3011', '', result)
    return result