I have a task that needs the number of search results a certain search engine reports for each keyword in a list, and I plan to do it with multithreading plus a proxy pool. I have not written many crawlers that go through proxies, so this task is also a chance to deepen my understanding of how proxies are used.
The code first:
import requests
import threading
from math import ceil
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re
import time
import random
import socket
import urllib3
import pickle
import os
import json
# Silence the warning that verify=False triggers:
# InsecureRequestWarning: Unverified HTTPS request is being made to host 'www.moudu.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings
requests.packages.urllib3.disable_warnings()
data = pd.read_csv('temp.csv')
data['词条'] = data['词条'].str.strip()
# User-Agent pool
user_agent_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0'
]
# Number of retries for requests
requests.adapters.DEFAULT_RETRIES = 5
result = []
# Fetch the proxy pool (a paid proxy API; you have to buy access yourself)
def get_proxies_by_url(url="http://xxx/api/?key=xxx"):
    while True:
        try:
            d = requests.get(url)
            # The API is expected to return JSON like [{"Ip": ..., "Port": ...}, ...]
            proxy_json = json.loads(d.text)
            proxy_list = []
            for i in proxy_json:
                proxy_list.append(('http', f"{i['Ip']}:{i['Port']}"))
            return proxy_list
        except (
                IndexError, TypeError, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError,
                socket.timeout,
                urllib3.exceptions.ReadTimeoutError, urllib3.exceptions.MaxRetryError):
            # The proxy API itself can be flaky; wait a moment and ask again
            time.sleep(1)
def search(keywords, name, page_num=1):
    s = requests.session()
    # With verify=False the session accepts any TLS certificate the server presents,
    # ignoring hostname mismatches and expired certificates
    s.verify = False
    # Ask for short-lived connections, otherwise the crawler gets banned quickly
    s.keep_alive = False
    proxies = []

    def query(wd):
        nonlocal page_num
        nonlocal proxies
        s.params = {'ie': 'utf-8',
                    'f': '8',
                    'rsv_bp': '1',
                    'rsv_idx': "1",
                    'tn': 'moudu',
                    'wd': wd,
                    'fenlei': '256',
                    'rqlang': 'cn',
                    "rsv_enter": "0",
                    "rsv_btype": "i",
                    "rsp": "0",
                    "rsv_dl": "ib", }
        while True:
            # Make sure there are proxies to choose from; refill the pool when it runs dry
            while True:
                if not proxies:
                    page_num += 1
                    # print(f"proxy list page {page_num}")
                    # proxies = get_free_proxies(page_num)
                    # Fetch a fresh batch of proxies
                    proxies = get_proxies_by_url()
                else:
                    break
            # Pick a proxy at random and register it for both http and https,
            # otherwise the https request below would bypass the proxy entirely
            proxy = random.choice(proxies)
            s.proxies = {'http': f"http://{proxy[1]}", 'https': f"http://{proxy[1]}"}
            s.headers = {'Host': 'www.moudu.com',
                         'Cache-Control': 'max-age=0',
                         'Connection': 'close',
                         'Referer': 'https://www.moudu.com/',
                         'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
                         'sec-ch-ua-mobile': '?0',
                         'sec-ch-ua-platform': '"Windows"',
                         'Upgrade-Insecure-Requests': '1',
                         'User-Agent': random.choice(user_agent_list),
                         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                         'Sec-Fetch-Site': 'same-origin',
                         'Sec-Fetch-Mode': 'navigate',
                         'Sec-Fetch-User': '?1',
                         'Sec-Fetch-Dest': 'document',
                         'Accept-Encoding': 'gzip, deflate, br',
                         'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                         'Sec-GPC': '1',
                         'DNT': '1',
                         }
            # print(s.proxies)
            # e.g. {'http': 'http://47.113.90.161:83', 'https': 'http://47.113.90.161:83'}
            try:
                # The request itself goes inside the try block, so that proxy and connection
                # errors are handled the same way as a page without a result count
                d = s.get("https://www.moudu.com/s", timeout=5, verify=False)
                return (wd, re.search('"asDataDispNum":"(.*?)"', d.text)[1])
            except Exception:
                # print(wd, e)
                # This proxy is broken; drop it and retry with another one
                proxies.remove(proxy)
                # return None
    for word in tqdm(keywords, desc=str(name)):
        result.append(query(word))
# Load the results saved by a previous run, if any
if not result:
    if os.path.exists('result.pic'):
        with open('result.pic', 'rb') as f:
            result = pickle.load(f)
while True:
    # Unique keywords
    kwds = data['词条'].unique().tolist()
    # Keep only the keywords that have not been queried yet
    if result:
        res = pd.merge(pd.DataFrame(kwds, columns=[0]), pd.DataFrame(result).drop_duplicates(), how='left')
        kwds = res[pd.isnull(res[1])][0].tolist()
    print(f"{len(kwds)} keywords still to query")
    if not kwds:
        break
    try:
        threads_list = []
        # Number of threads
        n_chunks = 6
        # Number of keywords per thread (chunks overlap by one, so there is a little redundancy)
        chunk_len = ceil(len(kwds) / n_chunks)
        for i in range(n_chunks):
            print(i * chunk_len, (i + 1) * chunk_len + 1)
            t = threading.Thread(target=search, args=(kwds[i * chunk_len:(i + 1) * chunk_len + 1], i))
            threads_list.append(t)
        # Start the threads, then wait for them all to finish
        for t in threads_list:
            t.start()
        for t in threads_list:
            t.join()
    except Exception:
        pass
    finally:
        # Whatever happens, save what this round has crawled so far
        with open('result.pic', 'wb') as f:
            pickle.dump(result, f)
        print("Saving this round's results")
kwds = data['词条'].unique().tolist()
res = pd.merge(pd.DataFrame(kwds, columns=['词条']),
               pd.DataFrame(result, columns=['词条', '数量']).drop_duplicates('词条'),
               on='词条')
# Final result: attach the counts to the original table and write it out
data.merge(res, how='left', on='词条').to_csv("result.csv", index=False, encoding="utf_8_sig")
Difficulties I ran into
1. At first I used the free IPs from Kuaidaili (快代理). The quality was poor, and after a while the free endpoint itself became unreachable. I ended up buying a one-day paid proxy plan; the quality is better and the success rate is still not high, but it is usable.
2. I started out single-threaded, which was painfully slow. Switching to multithreading helped a lot, though it is still slow, partly because of the proxy quality. I plan to try coroutines next (see the second sketch after this list).
3. At first I used long-lived (keep-alive) connections and got banned constantly. Someone experienced in a group chat pointed out that I should switch to short connections, i.e. set 'Connection' to 'close' in the request headers, or set the session's keep_alive attribute to False.
4. I still ran into errors midway, and even listing these exceptions explicitly in the except clause
   requests.exceptions.ConnectTimeout,
   requests.exceptions.ConnectionError,
   socket.timeout,
   urllib3.exceptions.ReadTimeoutError,
   urllib3.exceptions.MaxRetryError
   did not catch them all (so I fell back to a bare except). An article I found says this is related to SSL certificates and suggests creating a dedicated environment for the crawler; I have not tried that. The error in question looks like this (the first sketch after this list shows a simpler way to catch it):
   requests.exceptions.SSLError: HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, u'[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)'),))
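On point 4: as far as I can tell, the exceptions requests raises itself, including requests.exceptions.SSLError, ConnectTimeout, ConnectionError and ProxyError, all inherit from requests.exceptions.RequestException (requests wraps the underlying urllib3 and socket errors), so catching that one base class covers them. A minimal sketch of a single attempt through one proxy with that broad catch; fetch_once, its parameters and the proxy format are illustrative, not part of the script above:

import requests

def fetch_once(session, url, proxy_addr):
    """One request through one proxy; returns None on any request-level failure."""
    try:
        return session.get(url,
                           proxies={'http': f'http://{proxy_addr}', 'https': f'http://{proxy_addr}'},
                           timeout=5, verify=False)
    except requests.exceptions.RequestException:
        # SSLError, ConnectTimeout, ConnectionError, ReadTimeout, ProxyError, ...
        # all derive from RequestException, so this one handler is enough
        return None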

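For the coroutine plan in point 2, this is roughly the direction I have in mind, only a sketch: it assumes asyncio + aiohttp (which the script above does not use), a hypothetical fetch_count helper, and a single proxy address instead of the rotating pool:

import asyncio
import re
import aiohttp

async def fetch_count(session, wd, proxy_addr):
    # One keyword, one request; aiohttp takes the proxy per request (http:// proxies only)
    async with session.get("https://www.moudu.com/s",
                           params={'ie': 'utf-8', 'wd': wd},
                           proxy=f"http://{proxy_addr}",
                           ssl=False,
                           timeout=aiohttp.ClientTimeout(total=5)) as resp:
        text = await resp.text()
        m = re.search('"asDataDispNum":"(.*?)"', text)
        return (wd, m[1] if m else None)

async def main(keywords, proxy_addr):
    # All keywords are requested concurrently on a single event loop
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_count(session, wd, proxy_addr) for wd in keywords))

# results = asyncio.run(main(kwds, "47.113.90.161:83"))

Proxy rotation, retries and the bad-proxy cleanup would still have to be layered on top, the same way the threaded version does it.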


