import os

import jieba.posseg as pseg

# clean_str, createVocabList and setOfWords2VecFactory are assumed to be
# defined elsewhere in this module.


def jieba_cut_and_save_file(inputList, n_weight, a_weight, output_cleaned_file=False):
    """
    1. Segment each Chinese line in inputList with jieba POS tagging.
    2. Optionally save the segmented result to a file.
    """
    output_file = os.path.join('./data/', 'cleaned_trainMatrix.txt')
    lines = []
    tags = []
    for line in inputList:
        result = pseg.cut(clean_str(line))
        a = []
        b = []
        for word, flag in result:
            # Keep only non-space tokens from the segmented news text
            if word != ' ':
                a.append(word)
                if flag.startswith('n'):
                    # Noun: weight it with n_weight
                    b.append(n_weight)
                elif flag.startswith('a'):
                    # Adjective: weight it with a_weight
                    b.append(a_weight)
                else:
                    b.append(1)
        lines.append(a)
        tags.append(b)
    if output_cleaned_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            for line in lines:
                f.write(" ".join(line) + '\n')
    vocabulary = createVocabList(lines)
    # Build a vectorizer from the vocabulary, then vectorize each news item
    setOfWords2Vec = setOfWords2VecFactory(vocabulary)
    vectorized = []
    for i, news in enumerate(lines):
        vector = setOfWords2Vec(news, tags[i])
        vectorized.append(vector)
    return vectorized, vocabulary
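

# --- Illustrative sketch (not part of the original module) -----------------
# A minimal guess at what setOfWords2VecFactory could look like, shown only
# to clarify how the per-token POS weights produced above feed into the
# vector: each news item becomes a vocabulary-sized bag-of-words vector in
# which a word's count is scaled by its weight (n_weight for nouns, a_weight
# for adjectives, 1 otherwise). The repo's real helper may differ.
def setOfWords2VecFactory_sketch(vocabulary):
    # Precompute word -> index lookups once so each vectorization is
    # O(len(news)) instead of scanning the vocabulary per word.
    index = {word: i for i, word in enumerate(vocabulary)}

    def setOfWords2Vec(news, weights):
        vector = [0] * len(vocabulary)
        # news and weights are built in lockstep above, so zip is safe
        for word, weight in zip(news, weights):
            if word in index:
                vector[index[word]] += weight
        return vector

    return setOfWords2Vec


# Hypothetical usage, assuming the raw news lines are already loaded:
#
#     with open('./data/trainMatrix.txt', encoding='utf-8') as f:
#         raw_lines = f.read().splitlines()
#     vectorized, vocabulary = jieba_cut_and_save_file(
#         raw_lines, n_weight=3, a_weight=2, output_cleaned_file=True)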