去除文本中的特殊符号、停用词等，对文本进行分词。
lda_jieba_preprocess.py
# -*- coding: UTF-8 -*-
import re
import jieba
import jieba.analyse
def cut_seg(input_file_name, output_file_name, stop_word_list):
stop_words = set(stop_word_list)
with open(input_file_name, 'r') as fin, open(output_file_name, 'w') as fout:
for line in fin:
item_list = line.strip().split("t")
if len(item_list) != 2:
continue
text = item_list[1]
# 过滤掉"[ 龇牙 ]、[ 握手 ]"之类的表情符号,第一个[表示转移
text = re.sub('[[0-9a-zA-Zu4e00-u9fa5.,,。?“”]+]', '', text)
# text = re.sub(u'[^0-9a-zA-Zu4e00-u9fa5.,,。?“”]+', ' ', text)
temp_list = jieba.cut(text, cut_all=True)
content = ' 


