book_parser metree.HTML(temp_content, metree.HTMLParser())
li_list book_parser.xpath( //div[ class section-works ]/ul[ class works-list ]/li )
# print(len(li_list))
book_infos_list []
for li_element in li_list:
book_infos []
# 书名
book_name li_element.xpath( .//span[ class title-text ]//text() )
if len(book_name) 0:
book_infos.append(book_name[0])
# 作者
author li_element.xpath( .//div[ class author ]/a[1]/span/span/text() )
if len(author) 0:
book_infos.append(author[0])
# 简介
introduce li_element.xpath( .//div[ class intro ]/span//text() )
if len(introduce) 0:
intro_infos1 re.sub(r s , , introduce[0])
intro_infos re.sub( , , , intro_infos1)
book_infos.append(intro_infos)
# print(intro_infos)
# 书本信息
book_sticky li_element.xpath( .//div[ class extra-info ]//text() )
if len(book_sticky) 0:
text .join(book_sticky)
book_infos.append(text)
# print(text)
# 价格
sale_list li_element.xpath( .//span[ class sale ]/span//text() )
if len(sale_list) 2:
before_sale sale_list[1]
now_sale sale_list[2]
# print(before_sale)
# print(now_sale)
else:
now_sale 0
before_sale sale_list[1]
book_infos.append(str(before_sale))
book_infos.append(str(now_sale))
book_infos_list.append(book_infos)
# print(book_infos_list)
return book_infos_list
def save_book_infos_by_csv(self, datas):
    """Append book records to ``./豆瓣信息.csv``.

    Args:
        datas: Iterable of rows; each row is an iterable of field values
            describing one book.

    Every row becomes one line of the file. Fields go through
    ``csv.writer`` so that a value containing a comma, a double quote or a
    line break is quoted instead of shifting the columns that follow it.
    Rows whose fields contain none of those characters come out exactly as
    a plain ``','.join(row) + '\\n'`` would write them.
    """
    import csv  # function-scope import keeps the method self-contained

    # Append mode: the file is expected to exist already with its header
    # row, and each call adds one page worth of records below it.
    with open('./豆瓣信息.csv', 'a', encoding='utf-8') as out_file:
        # lineterminator='\n' replaces the csv default '\r\n' so rows end
        # the same way as a hand-written '\n' line in this text-mode file.
        row_writer = csv.writer(out_file, lineterminator='\n')
        row_writer.writerows(datas)
def run(self):
    """Crawl every listing page and store the results in ``./豆瓣信息.csv``.

    The CSV file is opened in write mode first, so any earlier content is
    discarded and only the header row remains. Each URL returned by
    ``self.get_url_list()`` is then passed through ``self.parse_html`` and
    ``self.catch_book_infos``, and the extracted rows are handed to
    ``self.save_book_infos_by_csv``. A progress line is printed after each
    page and a completion message once all pages are done.
    """
    # Start from a fresh file that holds nothing but the column header.
    with open('./豆瓣信息.csv', 'w', encoding='utf-8') as header_file:
        header_file.write('书名,作者,简介,书本信息,原价,折扣价\n')

    # enumerate(start=1) provides the 1-based page number for the message.
    for page_no, url in enumerate(self.get_url_list(), start=1):
        html_content = self.parse_html(url)
        book_data = self.catch_book_infos(html_content)
        self.save_book_infos_by_csv(book_data)
        print('正在爬取并保存第%d页豆瓣书籍信息......' % page_no)
    print('全部豆瓣书籍网页数据保存成功!!!!!')