预处理文本如下:
这是一种很强势的处理方法:去掉中文字符之外的所有字符(标点、数字、字母、空白等)。如果对数据有其它要求,建议换一种方法。
import pandas as pd
import re
import csv
import os
from pandas import DataFrame
# Build the list of lines to be cleaned.
def cleanlist(path='D:待处理文本.csv'):
    """Return the lines of a text file with surrounding whitespace stripped.

    Args:
        path: File to read. Defaults to the input path the script was
            originally hard-coded to use.

    Returns:
        A list with one stripped string per line of the file (decoded as
        UTF-8). Blank lines are kept as empty strings.
    """
    # The keyword was previously misspelled ``enconding``, which makes
    # ``open`` raise TypeError; the handle was also never closed.
    with open(path, encoding='UTF-8') as source:
        return [line.strip() for line in source]
def is_chinese(uchar):
    """Return True if *uchar* is a Chinese character in U+4E00..U+9FA5.

    Args:
        uchar: A string; intended to be a single character.

    Returns:
        True when the character lies in the inclusive range U+4E00..U+9FA5,
        otherwise False (including for the empty string).
    """
    # The bounds must be the escaped code points. The plain strings 'u4e00'
    # and 'u9fa5' compare lexicographically starting with the letter 'u', so
    # every Chinese character sorted above the upper bound and was rejected.
    return '\u4e00' <= uchar <= '\u9fa5'
def allcontents(contents):
    """Keep only the Chinese characters of *contents*.

    Every character for which ``is_chinese`` returns False is dropped. The
    cleaned text is also printed as a progress message.

    Args:
        contents: The text to clean (iterated character by character).

    Returns:
        A string made of the characters of *contents* accepted by
        ``is_chinese``, in their original order.
    """
    # join() avoids rebuilding the string on every kept character.
    content = ''.join(char for char in contents if is_chinese(char))
    # The label needs real newlines; it previously printed a literal 'n'.
    print('\n处理后的句子为:\n' + content)
    return content
# Input and output file locations.
# NOTE(review): "D:name" is a drive-relative path on Windows (resolved against
# the current directory of drive D:). If the drive root was intended, use
# r"D:\..." instead -- confirm before changing.
filename = "D:待处理文本.csv"
outfilename = "D:清洗结果.csv"
# Clean each input line and write it to the output file, one cleaned line per
# row. The context manager closes both files even if a line fails to process.
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='gbk') as outputs:
    for line in inputs:
        line_seg = allcontents(line)
        # Terminate each record with a real newline; a literal 'n' was
        # previously appended, which ran all records together on one line.
        outputs.write(line_seg + '\n')
处理之后的结果,如下图所示:



