Import modules

import urllib.request
import urllib.parse
from lxml import etree  # imported in the original but not used in the code shown here
import requests
import pandas as pd
import re
Read the text we have already collected and count how many times each character occurs. These counts are used later to score candidate sentences.
def str_count_three(strs: str):
    # Count how many times each character occurs in the string.
    from collections import Counter
    return Counter(strs)

with open('count.txt', 'r', encoding='utf8') as f:
    content = f.read()

features = dict(str_count_three(content))
# Drop punctuation and newlines so they do not influence the scores.
del features[',']
del features['。']
del features['\n']
del features[';']
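
For intuition, a clause's score is the summed corpus counts of its characters, divided by the clause length. Here is a minimal, standalone sketch of that rule; sample_features and the test clause are fabricated for illustration:

def score(clause: str, feats) -> float:
    # Sum the counts of characters that appear in the feature table,
    # then normalise by the clause length.
    total = sum(count for ch in clause for feat, count in feats if ch == feat)
    return total / len(clause)

sample_features = [('叶', 120), ('花', 95), ('毛', 40)]  # fabricated (character, count) pairs
print(score('叶片有毛', sample_features))  # (120 + 40) / 4 = 40.0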
Fetch the page
def query(content):
    # Request URL
    url = 'https://baike.baidu.com' + content
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    res = requests.get(url=url, headers=headers)
    return res.text
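
As a quick usage sketch (the term 狗尾草 is only an illustrative example), the caller percent-encodes the page title and passes the path:

html = query('/item/' + urllib.parse.quote('狗尾草'))  # hypothetical example term
print(html[:200])  # first 200 characters of the returned HTML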
Filter the Baike text
def get_data(result):
    # Split the page text into sentences on the Chinese full stop.
    result = result.split('。')
    templist = []
    # Keep sentences that mention a plant-part keyword and contain
    # neither HTML tags nor newlines.
    for part in result:
        for feature in features_mian:
            if feature in part and '<' not in part and '>' not in part and '\n' not in part:
                templist.append(part)
    # Score each clause: sum the corpus counts of its characters,
    # normalised by clause length.
    tempdict = dict()
    for part1 in templist:
        parts1 = part1.split(';')
        for part2 in parts1:
            parts2 = part2.split('；')
            for part in parts2:
                if not part:
                    continue
                tempdict[part] = 0
                for word in part:
                    for feature in features:
                        if word == feature[0]:
                            tempdict[part] += feature[1]
                tempdict[part] = tempdict[part] / len(part)
    # Sort clauses by score, highest first, and join the best ones
    # until the answer exceeds 50 characters.
    tempdict = sorted(tempdict.items(), key=lambda d: d[1], reverse=True)
    final_anser = ''
    for sentence in tempdict:
        final_anser += sentence[0] + ';'
        if len(final_anser) > 50:
            break
    return final_anser
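
A standalone sanity check of get_data might look like the following. It fabricates the two module-level names (features and features_mian) that the real script builds from count.txt and sets in the main block, so it should be run on its own rather than inside the full script:

features = [('叶', 120), ('花', 95), ('毛', 40)]   # fabricated (character, count) pairs
features_mian = ['花', '茎', '叶', '根', '毛', '穗']
page = '一年生草本。叶片线形，两面有毛；花序圆柱状。'  # toy page text
print(get_data(page))  # -> 花序圆柱状;叶片线形，两面有毛;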
Main program
Read the list of words to look up, fetch the Baike page for each, filter out the candidate sentences, and rank them by the character counts gathered from the sample.
if __name__ == '__main__':
    data = pd.read_excel('杂草统计.xlsx')
    grass_words = data['杂草名称']
    print(grass_words)
    # Keep only the 30 most frequent characters as scoring features.
    features = sorted(features.items(), key=lambda d: d[1], reverse=True)
    features = features[:30]
    # Plant-part keywords that a useful descriptive sentence should mention.
    features_mian = ['花', '茎', '叶', '根', '毛', '穗']
    shibie_list = list()
    error_list = list()
    for grass_word in grass_words:
        content = '/item/' + urllib.parse.quote(grass_word)
        result = query(content)
        final_anser = get_data(result)
        print(final_anser)
        if final_anser == '':
            print(grass_word)
            urls = re.findall('



