'''
爬取菜鸟100例 https://www.runoob.com/python/python-100-examples.html 标题,题目,分析
该网站多模板 题目分析可能在不同的P标签内 也可能在同一标签内
'''
'''
本脚本采用 urllib 和 xpath 来爬取
'''
import urllib.request as request
import urllib.error
from lxml import etree
import MySQLdb
def get_html(url):
    """Download a page and return its body decoded as UTF-8.

    :param url: address to fetch, passed unchanged to ``urllib.request.urlopen``.
    :return: the decoded response body, or ``None`` when the request could not
        be opened (the reason is printed before returning).
    """
    try:
        res = request.urlopen(url)
    except urllib.error.URLError as e:
        # HTTPError is a subclass of URLError and both expose ``reason``,
        # so a single handler covers the two cases.
        print(e.reason)
        return None
    except Exception as e:
        # e.g. ValueError raised for a malformed URL
        print(e)
        return None
    # Close the response as soon as the body has been read.
    with res:
        return res.read().decode('utf-8')
def get_urllist(html):
    """Build the list of example-page URLs found in the index page.

    Selects the ``href`` of every ``<a target="_blank">`` that is a direct
    child of an ``<li>`` and prefixes each one with the site origin.

    :param html: markup of the index page, as text.
    :return: list of URL strings, in document order.
    """
    site_root = 'https://www.runoob.com'
    tree = etree.HTML(html)
    links = []
    # NOTE(review): plain concatenation assumes every href starts with "/";
    # confirm against the live page.
    for href in tree.xpath('//li/a[@target="_blank"]/@href'):
        links.append(site_root + href)
    return links
def get_data(html):
    """Extract the exercise statement and its analysis from an example page.

    The text of every ``<p>`` directly under ``<div id="content">`` is
    collected. The paragraphs between the first one and the one containing the
    source-code heading are joined, then split at the analysis marker.

    :param html: markup of one example page, as text.
    :return: ``(title, fenxi)`` tuple of stripped strings; ``fenxi`` is ``'无'``
        when the page has no analysis marker.
    """
    tree = etree.HTML(html)
    paragraphs = [p.xpath('string(.)')
                  for p in tree.xpath('//div[@id="content"]/p')]

    # The description ends at the last paragraph that mentions the source-code
    # heading. When none does, fall back to -2, which drops the final two
    # paragraphs from the slice below.
    end = -2
    for pos, text in enumerate(paragraphs):
        if '程序源代码' in text:
            end = pos

    # The first paragraph is always skipped.
    # NOTE(review): presumably it is a page-level intro; confirm against the site.
    body = "".join(paragraphs[1:end])

    # partition splits on the first marker only, so any later text is kept in
    # the analysis instead of being silently discarded.
    title, marker, fenxi = body.partition('程序分析:')
    if not marker:
        fenxi = '无'
    # Strip the title in both cases so stored values are consistent.
    return title.strip(), fenxi.strip()
def save_to_mysql(args):
    """Insert one row into the ``example`` table.

    Uses the module-level ``cursor`` and ``conn`` objects. On any error the
    transaction is rolled back and the error is printed; otherwise the insert
    is committed.

    :param args: the two values bound to the statement's placeholders.
    :return: None
    """
    insert_stmt = 'insert into example values(%s,%s)'
    try:
        cursor.execute(insert_stmt, args)
    except Exception as err:
        print("存入数据库错误", err)
        conn.rollback()
        return
    print('存入数据库')
    conn.commit()
if __name__ == '__main__':
    # Script entry point: fetch the index, then scrape and store every example.
    url = 'https://www.runoob.com/python/python-100-examples.html'
    orgin_html = get_html(url)
    if orgin_html is None:
        # get_html has already printed the reason; nothing to crawl.
        raise SystemExit('无法获取目录页')

    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    # charset must be set so that Chinese text is stored correctly.
    conn = MySQLdb.Connection(
        host='localhost',
        user='root',
        password='123456',
        port=3306,
        db='testdata',
        charset='utf8',
    )
    try:
        # conn and cursor stay module-level: save_to_mysql reads them as globals.
        cursor = conn.cursor()
        try:
            for example_url in get_urllist(orgin_html):
                print(example_url)
                page = get_html(example_url)
                if page is None:
                    # Download failed (reason already printed); skip this
                    # example instead of crashing the whole run.
                    continue
                save_to_mysql(get_data(page))
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if a page fails to parse.
        conn.close()