#!/usr/bin/env python3 # encoding: utf-8 import sys # sys.argv[1]: lm dict # sys.argv[2]: lexicon file # sys.argv[3]: lexicon file for corpus.dict lex_dict = {} with open(sys.argv[2], "r", encoding="utf8") as fin: for line in fin: words = line.strip().split("\t") if len(words) != 2: continue lex_dict[words[0]] = words[1] with open(sys.argv[1], "r", encoding="utf8") as fin, open( sys.argv[3], "w", encoding="utf8" ) as fout: for line in fin: word = line.strip() if word == "" or word == "": continue word_lex = "" if word in lex_dict: word_lex = lex_dict[word] else: for i in range(len(word)): if word[i] in lex_dict: word_lex += " " + lex_dict[word[i]] else: word_lex += " " fout.write("{}\t{}\n".format(word, word_lex.strip()))