感觉应该要趁热打铁,不然放几天又凉了。
跟上一个例子相比:
1.这次我使用了字典来保存数据,并且把字典的内容存储到excel表格
2.使用正则表达式解析字符串,分离字符串
3.替换掉出现的中文全角空格
import requests
from bs4 import BeautifulSoup
import re
# import bs4
import xlsxwriter as xw
# Site root; relative hrefs scraped from the index page are joined onto this.
baseUrl = "https://www.ygdy8.com"
# Field labels used to slice each detail page's "◎label value◎label value…" text blob.
listContents = ("译名", "片名", "年代", "产地", "类别", "语言", "字幕", "上映日期", "IMDb评分", "豆瓣评分", "片长", "导演", "编剧", "主演", "简介")
def getHtmlText(url):
    """Fetch *url* and return the decoded page text.

    On any request failure returns the fixed string "网页打开失败";
    callers detect failure by the short length of the result
    (see the `len(html1) > 500` checks).
    """
    try:
        # Timeout so a dead host cannot hang the whole crawl indefinitely.
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raise on HTTP error status (4xx/5xx)
        # The site does not reliably declare its encoding; trust the detector.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`; only network/HTTP failures
        # map to the sentinel string (KeyboardInterrupt etc. propagate).
        return "网页打开失败"
def subGetInfoFromHtml(uLIst, html):
    """Parse one movie detail page and append a field dict to *uLIst*.

    The div with id="Zoom" holds a "◎label value◎label value…" text
    blob; each label in `listContents` is extracted with a non-greedy
    regex.  The page title goes under "title" and the magnet/download
    link under "href".
    """
    soup = BeautifulSoup(html, "html.parser")
    dit = {}
    # Page title block.
    title = soup.find("div", class_="title_all")
    print("title = ", title.string)
    dit["title"] = title.string
    # The movie's metadata text lives inside the div with id="Zoom".
    tags = soup.find_all("div", id="Zoom")
    for it in tags:
        # BUG FIX: original used u'u3000'/u'xa0' (missing backslashes),
        # which never matched anything.  Strip ideographic spaces and
        # turn non-breaking spaces into plain spaces.
        str1 = it.text.replace(u'\u3000', u'').replace(u'\xa0', u' ')
        str1 = re.sub(r"磁力链.*", "", str1)  # drop the magnet-link footer text
        # BUG FIX: original r"n" deleted every letter 'n'; remove
        # newlines so label/value pairs form one continuous string.
        str1 = re.sub(r"\n", "", str1)
        # Trailing sentinel so the last field ("简介") also matches the
        # "label…◎" pattern below.
        str1 += "◎"
        for sub in listContents:
            # Non-greedy: capture text between this label and the next ◎.
            pat = str(sub) + "(.+?)◎"
            t = re.findall(pat, str1)
            print(sub, " = ", t)
            if len(t):
                dit[sub] = t[0]
            else:
                dit[sub] = 'None'  # label absent on this page
        dit["href"] = it.a["href"]  # download/magnet link
    uLIst.append(dit)
def getInfoFromHtml(uLIst, html):
    """Walk the index page's movie-list anchors and scrape each detail page.

    Appends one dict per successfully fetched page to *uLIst* and
    returns the number of anchors seen (including the skipped first one).
    """
    soup = BeautifulSoup(html, "html.parser")
    n = 0  # anchors seen so far
    tags = soup.find("div", class_="co_content2")
    if tags is None:
        # Robustness fix: original crashed with TypeError when the list
        # container was missing (layout change / error page).
        print("n=", n)
        return n
    for it in tags("a"):
        url_t = baseUrl + it["href"]  # build the absolute detail-page URL
        n += 1
        if n == 1:
            continue  # first anchor is a header link, not a movie
        html1 = getHtmlText(url_t)
        # The failure sentinel is far shorter than any real page.
        if len(html1) > 500:
            subGetInfoFromHtml(uLIst, html1)
    print("n=", n)
    return n
def printList(List, num=20):
    """Deliberate no-op: console dumping of scraped rows is disabled."""
    pass
def saveListToExcel(LIst, fileName):
    """Write the list of movie dicts to *fileName* as an .xlsx workbook.

    Row 1 holds the first dict's keys as the header; each subsequent row
    holds one dict's values.  Assumes every dict shares the same key
    order (they are all built by subGetInfoFromHtml).
    """
    if not LIst:
        # Robustness fix: LIst[0] would raise IndexError on an empty list.
        return
    workbook = xw.Workbook(fileName)        # create the workbook
    worksheet1 = workbook.add_worksheet("sheet1")  # create the sheet
    worksheet1.activate()                   # make it the active sheet
    worksheet1.write_row('A1', LIst[0].keys())  # header row from dict keys
    # enumerate replaces the hand-rolled `k` counter; data starts at row 2.
    for k, it in enumerate(LIst, start=2):
        worksheet1.write_row("A" + str(k), it.values())
    workbook.close()  # flush and close the file
if __name__ == '__main__':
    infoList = []
    try:
        # Reuse the cached index page from a previous run when possible.
        with open("dianying_save", mode="r", encoding="utf-8") as fp:
            html = fp.read()
        if len(html) < 1000:
            # BUG FIX: original did `raise ()`, which raises TypeError
            # (a tuple is not an exception). Raise a real exception so
            # a too-short cache also triggers a re-download.
            raise ValueError("cached page too short")
    except (OSError, ValueError):
        # Cache missing, unreadable, or too short: fetch and re-cache.
        print("文件打开失败,下载网页")
        url = "https://www.ygdy8.com/index.html"
        html = getHtmlText(url)
        # `with` fixes the original's leaked file handle in the read branch
        # and guarantees the cache is flushed even on later errors.
        with open("dianying_save", mode="w", encoding="utf-8") as fp:
            fp.write(html)
    if len(html) > 1000:
        num = getInfoFromHtml(infoList, html)
        if num > 0:
            saveListToExcel(infoList, "阳光电影.xlsx")
    else:
        print("文件内容小于1000个字节,不做解析处理")



