"""Segment text with a dictionary and rewrite each word as its mapped pieces.

Reads a segmentation dictionary and a text file of ``<id> <text...>`` lines,
then writes the tokenized text and per-utterance token counts to the output
directory.
"""

import re
import argparse

# A word is kept only if its first character is a CJK ideograph in
# U+4E00-U+9FA5, an ASCII letter, or an ASCII digit.
_WORD_START_PATTERN = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")


def load_dict(seg_file):
    """Load the segmentation dictionary from *seg_file*.

    Each non-blank line is split on whitespace: the first field is the key
    and the remaining fields, re-joined with single spaces, are the value
    (an empty string when the line holds only a key). A key that appears
    more than once keeps its last value.

    Args:
        seg_file: Path of the dictionary file, read as UTF-8.

    Returns:
        A dict mapping each key to its space-joined value string.
    """
    seg_dict = {}
    with open(seg_file, "r", encoding="utf-8") as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                # Blank lines carry no entry; indexing them would raise.
                continue
            seg_dict[fields[0]] = " ".join(fields[1:])
    return seg_dict


def forward_segment(text, dic):
    """Split *text* by greedy forward maximum matching against *dic*.

    At each position the longest substring starting there that is contained
    in *dic* is taken as the next word; when no substring of two or more
    characters matches, the single character at that position is emitted.

    Args:
        text: The string to segment.
        dic: Any container supporting ``in`` for candidate words.

    Returns:
        The list of words, in order; concatenating them reproduces *text*.
    """
    word_list = []
    text_len = len(text)
    start = 0
    while start < text_len:
        # Fall back to one character so the scan always advances.
        longest_word = text[start]
        # Try the longest candidate first: the first hit is the maximum
        # match, so shorter prefixes never need to be examined.
        for end in range(text_len, start + 1, -1):
            candidate = text[start:end]
            if candidate in dic:
                longest_word = candidate
                break
        word_list.append(longest_word)
        start += len(longest_word)
    return word_list


def tokenize(txt, seg_dict):
    """Map each word of *txt* to its dictionary value and join with spaces.

    Words whose first character is not matched by the word-start pattern are
    dropped. Matching words found in *seg_dict* contribute their mapped
    value; matching words that are missing contribute an empty placeholder,
    so the result can contain consecutive spaces.

    Args:
        txt: Iterable of words (for example the output of forward_segment).
        seg_dict: Mapping from word to its space-separated replacement.

    Returns:
        The joined string with leading and trailing whitespace removed.
    """
    # NOTE(review): out-of-dictionary words emit an empty string here; an
    # explicit unknown-token marker may have been intended - confirm.
    pieces = [
        seg_dict[word] if word in seg_dict else ""
        for word in txt
        if _WORD_START_PATTERN.match(word)
    ]
    return " ".join(pieces).strip()


def get_parser():
    """Build the command-line parser for this script.

    Every option is declared with ``required=True``, so the declared
    defaults are never used as values; they only appear in the help text
    produced by ``ArgumentDefaultsHelpFormatter``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="text tokenize",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--text-file",
        "-t",
        default=False,
        required=True,
        type=str,
        help="input text",
    )
    parser.add_argument(
        "--seg-file",
        "-s",
        default=False,
        required=True,
        type=str,
        help="seg file",
    )
    parser.add_argument(
        "--txt-index",
        "-i",
        default=1,
        required=True,
        type=int,
        help="txt index",
    )
    parser.add_argument(
        "--output-dir",
        "-o",
        default=False,
        required=True,
        type=str,
        help="output dir",
    )
    return parser


def main():
    """Tokenize the input text file and write the text and length files.

    For every non-blank input line the first field is the utterance id and
    the remaining fields are concatenated without spaces, lower-cased,
    segmented and tokenized. Two files are written in the output directory:
    ``text.<index>.txt`` with ``<id> <tokens>`` lines and ``len.<index>``
    with ``<id> <token count>`` lines.
    """
    args = get_parser().parse_args()

    text_path = "{}/text.{}.txt".format(args.output_dir, args.txt_index)
    shape_path = "{}/len.{}".format(args.output_dir, args.txt_index)
    seg_dict = load_dict(args.seg_file)

    # Context managers guarantee both outputs are flushed and closed even
    # if processing fails part-way through the input.
    with open(args.text_file, "r", encoding="utf-8") as infile, open(
        text_path, "w", encoding="utf-8"
    ) as txt_writer, open(shape_path, "w", encoding="utf-8") as shape_writer:
        for line in infile:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on a missing id.
                continue
            text_id = fields[0]
            words = forward_segment("".join(fields[1:]).lower(), seg_dict)
            text = tokenize(words, seg_dict)
            token_count = len(text.split())
            txt_writer.write(text_id + " " + text + "\n")
            shape_writer.write(text_id + " " + str(token_count) + "\n")


if __name__ == "__main__":
    main()