爬取CSDN博主文章列表，练习

# Blog home page URL. Most bloggers use https://blog.csdn.net/<blogger-name>;
# this blogger has a custom "dream" sub-domain instead.
blog_url = "https://dream.blog.csdn.net"
# blog_url = "https://blog.csdn.net/博主名称"

# Number of article-list pages; look at the blogger's home page and set it by hand.
pages = 2

# Fetch every list page in turn (pages are numbered from 1).
# NOTE(review): `ats` is passed to get_ats() before it is assigned anywhere in
# this snippet -- presumably it is initialised earlier in the file; confirm.
for i in range(1, pages + 1):
    url = f"{blog_url}/article/list/{i}"
    print(f"爬取。。。{url}")
    html = get_html(url)
    ats = get_ats(html, ats)

# CSV alternative: a CSV written this way shows garbled text in Excel unless it
# is first re-saved as ANSI with Notepad.
# wrt(ats)

# Save to Excel. with_url is the index of the column whose text becomes a
# hyperlink; in every row of ats that column must be immediately followed by
# the column holding its URL.
ex = excel()
ex.write_row(ats, with_url=1)
ex.save()

Excel 的封装：有的封装，封装了个寂寞；这里尝试封装一下。

from openpyxl import Workbook, load_workbook
import time
class excel():
    """Small convenience wrapper around an openpyxl ``Workbook``.

    Attributes:
        wb: the underlying ``Workbook`` created in ``__init__``.
        wbac: the workbook's active worksheet at creation time.
    """

    def __init__(self) -> None:
        self.wb = Workbook()
        self.wbac = self.wb.active

    def get_sheet_names(self):
        """Return the titles of all worksheets in the workbook."""
        # ``sheetnames`` replaces the deprecated ``get_sheet_names()``.
        return self.wb.sheetnames

    def write_row(self, list, sheet_name="Sheet", with_url=None):
        """Append every row of ``list`` to the worksheet named ``sheet_name``.

        Args:
            list: sequence of rows, each row a sequence of cell values.
                (The name shadows the builtin; it is kept so existing
                keyword callers continue to work.)
            sheet_name: title of the target worksheet.
            with_url: index of the column to turn into a hyperlink, or
                ``None`` to write the rows unchanged. The column directly
                after it must hold the URL. The first row (index 0) is
                never converted.

        Raises:
            KeyError: if the workbook has no worksheet named ``sheet_name``.
        """
        # ``wb[name]`` replaces the deprecated ``get_sheet_by_name()``.
        sheet = self.wb[sheet_name]
        for index, row in enumerate(list):
            # Work on a copy so the caller's rows are not modified.
            cells = [*row]
            linkable = (
                with_url is not None
                and index > 0
                # A URL column must exist right after the link column.
                and 0 <= with_url < len(cells) - 1
            )
            if linkable:
                cells[with_url] = '=HYPERLINK("{}", "{}")'.format(
                    cells[with_url + 1], cells[with_url]
                )
            sheet.append(cells)

    def new_sheet(self, name):
        """Create a new worksheet titled ``name``."""
        self.wb.create_sheet(name)

    def save(self, path=None):
        """Write the workbook to ``path``.

        Args:
            path: destination file. When omitted, a name of the form
                ``./files/Excel<milliseconds since epoch>.xlsx`` is generated
                at call time, so successive saves do not share one name.
        """
        import os

        if path is None:
            path = f"./files/Excel{str(round(time.time()*1000))}.xlsx"
        folder = os.path.dirname(path)
        if folder:
            # Saving fails when the target directory does not exist yet.
            os.makedirs(folder, exist_ok=True)
        self.wb.save(path)

封装的工具类（无用的 import 请自行删掉）：

import os, time, requests, random, telnetlib, json, pypinyin
from bs4 import BeautifulSoup
# Absolute path of the directory that contains this file.
# NOTE(review): a module-level ``__dir__`` is the hook ``dir(module)`` calls
# (PEP 562); binding it to a string makes ``dir()`` on this module fail. The
# name is kept because other code may import it -- consider renaming.
__dir__ = os.path.dirname(os.path.abspath(__file__))
# User agent sent when ``localhost`` is true.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
)

# Pool of user agents to pick from when ``localhost`` is false.
# NOTE(review): strings are reproduced as they appear in the source; the
# canonical crawler user agents may carry a "+" before the URL -- confirm.
_USER_AGENT_POOL = (
    _DEFAULT_USER_AGENT,
    "Mozilla/5.0 (compatible; Baiduspider/2.0; http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Baiduspider-render/2.0; http://www.baidu.com/search/spider.html)",
    "Baiduspider-image ( http://www.baidu.com/search/spider.htm)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot-Image/1.0; http://www.google.com/bot.html)",
    "Sogou web spider/4.0( http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Sogou News Spider/4.0( http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
    "Mozilla/5.0 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)",
    "Sosospider ( http://help.soso.com/webspider.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
)


def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    """Build the HTTP request headers used by the crawler.

    Args:
        localhost: when true, use the fixed desktop-Chrome user agent; when
            false, pick a user agent at random from the pool above.
        refer: value for the ``Referer`` header.
        host: value for the ``Host`` header; ``None`` is passed through
            unchanged.

    Returns:
        A dict with the keys ``User-Agent``, ``Referer`` and ``Host``.
    """
    ua = _DEFAULT_USER_AGENT if localhost else random.choice(_USER_AGENT_POOL)
    return {
        "User-Agent": ua,
        "Referer": refer,
        "Host": host,
    }
def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    """Fetch ``url`` with a GET request and return the response body.

    Args:
        url: address to request.
        ret_type: shape of the result -- ``"text"`` for the decoded text,
            ``"image"`` for the raw bytes, ``"json"`` for the parsed JSON.
        timeout: timeout passed to ``requests.get``.
        encoding: encoding assigned to the response before its text is read.

    Returns:
        The body in the requested shape, or ``None`` when ``ret_type`` is not
        one of the three recognised values.
    """
    headers = get_headers()
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    if ret_type == "text":
        return res.text
    if ret_type == "image":
        return res.content
    if ret_type == "json":
        return res.json()
    # Unrecognised ret_type: no value is produced.
    return None

好了，就是这样。尝试抓取一下自己的文章列表吧，修改这里：

 # Blog home page URL
 blog_url = "https://blog.csdn.net/博主名称"
 # Number of article-list pages; check the blogger's home page and set it by hand
 pages = 2

其他：保存为 csv 之后，再通过 Office Excel 打开会乱码。

别急，先用记事本打开，另存为时把编码改为 ANSI 就可以了。

爬取CSDN博主文章列表，练习

Python相关栏目本月热门文章