97 lines
3.2 KiB
Python
97 lines
3.2 KiB
Python
|
||
# ############################################################# #
|
||
# format table function
|
||
# ############################################################# #
|
||
|
||
def format_table(header, rows):
    """Print a simple fixed-width text table to stdout.

    Every column shares one width: the length of the longest cell found in
    either the header or the data rows, plus two characters of padding.

    Args:
        header: Iterable of column titles; each is converted with ``str()``.
        rows: Iterable of rows, each an iterable of cells; each cell is
            converted with ``str()``. May be empty, in which case only the
            header and the separator line are printed.

    Returns:
        None. The table is written with ``print``.
    """
    # Materialise and stringify once: the data is walked twice (measure,
    # then print), which would exhaust a generator on the first pass.
    header_cells = [str(cell) for cell in header]
    body = [[str(cell) for cell in row] for row in rows]

    # Include the header in the measurement so long titles do not run into
    # the next column; ``default=0`` keeps an empty table from raising.
    longest = max(
        (len(cell) for line in [header_cells, *body] for cell in line),
        default=0,
    )
    col_width = longest + 2

    # Header line.
    print("".join(cell.ljust(col_width) for cell in header_cells))
    # Separator line, one dash run per header column.
    print("".join("-" * col_width for _ in header_cells))
    # Data lines.
    for line in body:
        print("".join(cell.ljust(col_width) for cell in line))
|
||
|
||
# ############################################################# #
|
||
# encode and decode bytes and string
|
||
# ############################################################# #
|
||
|
||
import base64
|
||
def encode_bytes2str(data):
    """Encode a byte string as Base64 text.

    Args:
        data: Bytes-like object to encode, or ``None``.

    Returns:
        The Base64 representation as a ``str``, or ``None`` when ``data``
        is ``None``.
    """
    # None is passed through untouched so optional fields round-trip.
    return None if data is None else base64.b64encode(data).decode('utf-8')
|
||
|
||
def decode_str2bytes(data):
    """Decode Base64 text back into the original byte string.

    Args:
        data: Base64-encoded ``str``, or ``None``.

    Returns:
        The decoded ``bytes``, or ``None`` when ``data`` is ``None``.
    """
    if data is not None:
        # The text is converted to bytes first, then Base64-decoded.
        encoded = data.encode('utf-8')
        return base64.b64decode(encoded)
    return None
|
||
|
||
import re
|
||
def split_sentences(text: str):
    """Split text into fragments at runs of punctuation.

    The delimiters themselves are discarded, and a run of several
    consecutive delimiters counts as a single split point.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty fragments, in their original order.
    """
    # Character class of delimiter marks; the trailing "+" collapses runs.
    delimiter_run = r'[\。\,\、\;\:\?\!\“\”\(\)\《\》]+'
    # re.split yields '' at leading/trailing delimiters; filter drops those.
    return list(filter(None, re.split(delimiter_run, text)))
|
||
'''
|
||
# 示例文本
|
||
text = "今天天气真好,我们去公园玩吧!你觉得怎么样?好的,那就这么定了。"
|
||
# 调用函数进行断句
|
||
sentences = split_sentences(text)
|
||
|
||
print(sentences)
|
||
'''
|
||
|
||
def split_chinese_text(text: str, return_patch=False, punctuations=None):
    """Split text into sentences, keeping each trailing punctuation mark.

    Args:
        text: The string to split.
        return_patch: Selects the return shape (see Returns).
        punctuations: Optional iterable (for example a ``str`` or ``set``)
            of single characters to split on. When ``None`` the built-in
            default set is used.

    Returns:
        When ``return_patch`` is false: ``(sentences, punctuation_list)``,
        where each sentence includes its terminating mark and
        ``punctuation_list`` holds the marks in order. Trailing text with
        no terminating mark is appended as a final sentence.

        When ``return_patch`` is true: ``(parts, found)``.
        ``found`` is ``False`` if ``text`` contains no punctuation at all,
        and ``parts`` is then ``[text]``. Otherwise ``found`` is ``True``
        and ``parts`` is ``[text]`` when the text ends on a punctuation
        mark, or ``[terminated_prefix, unterminated_tail]`` when it does
        not.
    """
    # Honour a caller-supplied set; fall back to the default only for None.
    if punctuations is None:
        punctuations = set('。!?,;:、“”()《》【】')
    else:
        punctuations = set(punctuations)

    sentences = []
    punctuation_list = []

    start = 0  # index where the current sentence begins
    for i, char in enumerate(text):
        if char in punctuations:
            # Close the sentence at this mark and remember the mark.
            sentences.append(text[start:i + 1])
            punctuation_list.append(char)
            start = i + 1

    # Trailing text that was not closed by a punctuation mark.
    if start < len(text):
        sentences.append(text[start:])

    if return_patch:
        if not punctuation_list:
            # Nothing to split on: the whole text is an unterminated tail.
            return [text], False
        if len(sentences) == len(punctuation_list):
            # Every sentence is terminated, so there is no tail.
            return [''.join(sentences)], True
        # Terminated prefix plus the unterminated tail.
        return [''.join(sentences[:-1]), sentences[-1]], True

    return sentences, punctuation_list
|
||
'''
|
||
# 示例文本
|
||
text = "你好,世界!今天天气怎么样?希望你有一个美好的一天。{}"
|
||
sentences, punctuation_list = split_chinese_text(text)
|
||
|
||
print("断句结果:", sentences)
|
||
print("标点符号列表:", punctuation_list)
|
||
'''
|
||
|
||
def remove_brackets_and_contents(text):
    """Remove bracketed spans, brackets included, from text.

    Three bracket styles are stripped, one pass each and in this order:
    ASCII parentheses, full-width parentheses (U+FF08 / U+FF09), and
    lenticular brackets (U+3010 / U+3011).

    Matching is non-greedy and does not cross newlines, so a bracket pair
    that spans several lines is left in place, and nested brackets of the
    same style leave the outer closing bracket behind.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched bracketed span deleted.
    """
    # Full-width parentheses are written as \u escapes so the pattern stays
    # unambiguous even if the file's text is width-normalised.
    bracket_patterns = (
        r'\(.*?\)',
        r'\uff08.*?\uff09',
        r'\【.*?\】',
    )
    # Passes stay sequential (not one alternation) so that text containing
    # only ASCII parentheses and lenticular brackets is handled as before.
    result = text
    for pattern in bracket_patterns:
        result = re.sub(pattern, '', result)
    return result
|