# Strip punctuation, English text and anything else that is not a Chinese character
def clean_text(value):
    """Return only the Chinese characters contained in *value*.

    Every character outside the CJK Unified Ideographs range
    (U+4E00..U+9FFF) is dropped: punctuation, Latin letters, digits,
    whitespace. The surviving characters are concatenated in their
    original order.

    Args:
        value: The raw text. Anything that is not a non-empty ``str``
            (``None``, ``""``, a float NaN from a data frame, ...) is
            treated as missing.

    Returns:
        The cleaned string, or ``None`` when the input is missing or
        contains no Chinese characters at all.
    """
    import re

    # Guard clause: missing / non-text input has nothing to clean.
    if not isinstance(value, str) or not value:
        return None

    # The ``\u`` escapes are essential: without the backslashes the class
    # degenerates to a set of literal ASCII characters ("u", "4", "e", ...)
    # and keeps exactly the wrong text.
    text = "".join(re.findall(r"[\u4e00-\u9fff]+", value))
    return text if text else None
# Load the stop words that will be removed from the tokenized text
def stopwordslist(filepath):
    """Read a stop-word file and return its entries as a set.

    The file is decoded as UTF-8 and is expected to hold one stop word
    per line. Leading and trailing whitespace is stripped from every
    line; a blank line therefore contributes the empty string.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A ``set`` of the stripped lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    # ``with`` guarantees the handle is closed; the original left it open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return {line.strip() for line in handle}
# Load the Chinese stop words once, then drop them from every entry of
# words_clean (defined outside this view; presumably one token list per
# comment -- confirm against the cell that builds it).
stop_words_cn = stopwordslist('cn_stop_words.txt')
words_final = [
    [token for token in tokens if token not in stop_words_cn]
    for tokens in words_clean
]
# Word-frequency statistics
from collections import Counter

# Flatten the per-entry token lists and count every token.
word_freq = Counter(w for words in words_final for w in words)
# The pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError. Naming the columns in the constructor (instead of assigning
# them afterwards) also keeps an empty counter from failing with a
# column-length mismatch.
word_freq_df = pd.DataFrame(list(word_freq.items()), columns=['word', 'count'])
# Notebook-style display: the ten most frequent words, highest count first.
word_freq_df.sort_values('count', ascending=False).head(10)
# The word cloud agrees with the conclusion drawn from the data: reviews of the
# headphones are positive, but the reasons behind the neutral and negative
# reviews still need to be examined.
def plot(value, top_K):
    """Draw a word cloud of the most frequent words for one sentiment value.

    Relies on module-level names: ``df`` (with ``sentiment_value`` and
    ``content`` columns), ``jieba``, ``stop_words_cn``, ``Counter``, ``plt``
    and ``wc`` (presumably a configured ``WordCloud`` instance -- it is
    created outside this view).

    Args:
        value: Rows whose ``df['sentiment_value']`` equals this are used.
        top_K: How many of the most frequent words are handed to the cloud.
    """
    selected = df[df['sentiment_value'] == value]

    # Tokenize every selected comment and count the tokens that are not stop
    # words, in a single pass.
    word_freq = Counter(
        token
        for text in selected['content']
        for token in jieba.cut(text)
        if token not in stop_words_cn
    )

    # ``most_common`` yields the top_K (word, count) pairs directly, so the
    # round-trip through a data frame (which also used the non-existent
    # ``pd.Dataframe``) is not needed.
    word_freq_dict = dict(word_freq.most_common(top_K))
    wc.generate_from_frequencies(word_freq_dict)

    # Show the word cloud. The figure has to exist BEFORE ``imshow``:
    # creating it afterwards drew the cloud on a default-sized figure and
    # opened a second, blank one.
    # NOTE(review): 16x8 inches at dpi=1000 is a 16000x8000 px canvas --
    # confirm this resolution is really intended.
    plt.figure(figsize=(16, 8), dpi=1000)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Join the extracted keywords of each row with a "|" separator
# Keep only the rows that actually have cleaned text.
df = df[pd.notnull(df['text_clean'])]
# Iterate the column directly rather than using a row-wise ``df.apply``: only
# ``text_clean`` is read, and ``df.apply(..., axis=1)`` hands back a DataFrame
# (which has no ``tolist``) when the frame is empty.
keywords_text = [
    '|'.join(
        w
        for w in jieba.analyse.extract_tags(text, topK=5, withWeight=False, allowPOS=())
        if w not in stop_words_cn
    )
    for text in df['text_clean']
]
# Notebook-style display of the first two results.
keywords_text[:2]
# [notebook output] <16843x1000 sparse matrix of type '' (dtype lost in export)
# [notebook output] with 34537 stored elements in Compressed Sparse Row format>
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Fit a multiclass logistic regression on the iris data set.
X, y = load_iris(return_X_y=True)
# With the default iteration cap lbfgs stops before converging on the unscaled
# iris features (ConvergenceWarning), so the cap is raised explicitly.
# ``multi_class='multinomial'`` is omitted: for lbfgs on multiclass data the
# library default already selects the multinomial formulation, and the
# parameter is deprecated in newer scikit-learn releases.
clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000).fit(X, y)
# [notebook output, stderr]
# /opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
# Increase the number of iterations (max_iter) or scale the data as shown in:
# https://scikit-learn.org/stable/modules/preprocessing.html
# Please also refer to the documentation for alternative solver options:
# https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# n_iter_i = _check_optimize_result(