目录:问题描述、原因分析、解决方案
解决方案包括:方法一、方法二、方法三
代码清单:代码一、代码二、代码三、代码四、代码五、Test代码
# 项目场景:Python 3.8
问题描述:
在使用Python爬虫爬取网页的列表页中的详情页时,返回的详情页的html文件的数据长度有限。
原因分析:
频繁爬取目标网站,触发了该网站的反爬虫措施(例如服务器返回 521 状态码和一段用于生成 Cookie 的 JS 代码,而不是完整的页面内容)。
解决方案:
如果解决不了,你可以把要爬取网页的源码先保存下来,进行后续的处理。
方法一:换一个 VPN(即更换出口 IP),或者换一台电脑执行程序。
方法二:复制目标网页的Headers添加到代码中
根据目标情况不同修改
def askURL(url):
    """Build the browser-like request headers for fetching ``url``.

    NOTE(review): only the header dictionary is present in this snippet; the
    code that would send the request is not shown, so nothing is returned.

    :param url: Address of the page to fetch (not used by the visible code).
    :return: ``None``.
    """
    head = {  # Browser-like headers sent to www.mafengwo.cn.
        # "*/*" restored: the pasted value contained a bare "**", which is
        # not a valid media range.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
# Detail page requested by the functions of this snippet.
url = 'http://www.mafengwo.cn/poi/5423409.html'
# response = requests.get(url)
# # cookie1
# cookie1 = response.cookies
# # JS code
# js_code = response.text
def get_521_content(url, head):
    """Request ``url`` once and return the inline script text and cookies.

    :param url: Page to request.
    :param head: Header dictionary passed to ``requests.get``.
    :return: ``(script_text, cookie_string)``. ``script_text`` is the joined
        content of every ``<script>...</script>`` match in the response body;
        ``cookie_string`` is the response cookies as ``name=value; ...``.
    """
    req = requests.get(url, headers=head)
    cookies = '; '.join('='.join(item) for item in req.cookies.items())
    # The pattern had been reduced to an empty string, which makes findall()
    # return only empty matches and the joined result always ''.
    # NOTE(review): the <script> pattern is reconstructed -- confirm it
    # against a real challenge response.
    txt_521 = ''.join(re.findall(r'<script>(.*?)</script>', req.text))
    return (txt_521, cookies)
def fixed_fun(function):
    """Run the challenge script and return its first cookie pair.

    Reads the module-level ``url`` and ``head`` to fetch the page again, strips
    the browser-only statements from the script text, executes function ``l``
    with execjs and keeps the part of its result before the first ``;``.

    :param function: Script text obtained from the first request.
    :return: The text before the first ``;`` in the value returned by ``l``.
    """
    func_return = function.replace('eval', 'return')
    # NOTE(review): this compiled context is never called; the page is
    # fetched again below and that response is processed instead.
    content = execjs.compile(func_return)
    req = requests.get(url, headers=head)
    # The pattern had been reduced to an empty string (always yielding '').
    # NOTE(review): the <script> pattern is reconstructed -- confirm.
    evaled_func = ''.join(re.findall(r'<script>(.*?)</script>', req.text))
    # Drop statements that need a real browser and turn the cookie
    # assignment into a return value.
    mode_func = (
        evaled_func
        .replace('while(window._phantom||window.__phantomas){};', '')
        .replace('document.cookie=', 'return')
        .replace(';if((function(){try{return !!window.addEventListener;}', '')
        .replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", '')
        .replace("else{document.attachEvent('onreadystatechange',l);}", '')
        .replace(
            r"setTimeout('location.href=location.href.replace(/[?|&]captcha-challenge/,'')',1500);",
            '')
    )
    content = execjs.compile(mode_func)
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance
def cookie_dict(js, id):
    """Merge two cookie strings into one ``{name: value}`` dictionary.

    Each argument may hold a single ``name=value`` pair or several pairs
    separated by ``;``. Pairs from ``id`` are applied after those from ``js``,
    so they win when a name occurs in both.

    :param js: Cookie string produced by the challenge script.
    :param id: Cookie string taken from the first response.
    :return: Dictionary mapping cookie names to values.
    """
    cookies = {}
    for cookie_string in (js, id):
        for fragment in cookie_string.split(';'):
            fragment = fragment.strip()
            if not fragment:
                continue
            # Split on the first '=' only: values such as base64 text can
            # contain or end with '=' and must not be truncated.
            name, _, value = fragment.partition('=')
            cookies[name] = value
    return cookies
if __name__ == '__main__':
    # NOTE(review): ``head`` is only defined inside askURL() in the visible
    # snippet -- confirm a module-level ``head`` exists before running this.
    func = get_521_content(url, head)
    content = func[0]    # script text from the first response
    cookie_id = func[1]  # cookies set by the first response
    cookie_js = fixed_fun(content)
    dicted_cookie = cookie_dict(cookie_js, cookie_id)
# Browser-like request headers sent to www.mafengwo.cn.
head = {
    # "*/*" restored: the pasted value contained a bare "**", which is not a
    # valid media range.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}
def get_521_content(url):
    """Request ``url`` and, on a 521 answer, return the cookies to retry with.

    Uses the module-level ``head`` as request headers. When the response
    status is 521, the inline script is handed to ``fixed_fun`` and the
    ``name=value`` pair it returns is added to the response cookies.

    :param url: Page to request.
    :return: Dictionary of cookies including the computed pair, or ``None``
        when the status is not 521, no script text is found, or ``fixed_fun``
        returns nothing.
    """
    req = requests.get(url, headers=head, timeout=5)
    print(req.status_code, req.text)
    if req.status_code != 521:
        return None
    cookies = dict(req.cookies.items())
    print(cookies)
    # The pattern had been reduced to an empty string (always yielding '').
    # NOTE(review): the <script> pattern is reconstructed -- confirm.
    js_con = ''.join(re.findall(r'<script>(.*?)</script>', req.text))
    if not js_con:
        return None
    clearance = fixed_fun(js_con, url)
    if not clearance:
        return None
    # Split once: the value may itself contain or end with '=' (base64),
    # which made the two-name unpacking raise ValueError.
    key, value = clearance.split('=', 1)
    cookies[key] = value
    return cookies
# Execute the JS code to obtain the __jsl_clearance cookie pair.
def fixed_fun(js_con, url):  # js_con: JS text obtained from the first request
    """Evaluate the two-stage challenge script and return its cookie pair.

    Stage one replaces ``eval(`` with ``return(`` so the generated script is
    returned as text instead of being executed. Stage two removes the
    statements that need a browser, makes the script return the cookie string
    and executes it with execjs.

    :param js_con: Script text from the challenge response.
    :param url: Page address substituted for the value the script would read
        from ``firstChild.href``.
    :return: The part of the generated cookie string before the first ``;``,
        or ``None`` when the second stage fails.
    """
    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后: ', func_return)
    content = execjs.compile(func_return)
    # Name of the first-stage function: second space-separated token of the
    # text before the first '='.
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('第一次执行js代码后: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # dynamic function name
    # Text assigned to <fn>.innerHTML in the generated script. The original
    # split() separators had been reduced to empty strings, which raises
    # ValueError("empty separator"); the text is now cut out between the
    # assignment prefix and the closing "';" used in the replace() below.
    # NOTE(review): the original tag delimiters were lost -- confirm this
    # matches the script actually served.
    _, prefix_found, tail = evaled_func.partition(fn + ".innerHTML='")
    aa = tail.split("';", 1)[0] if prefix_found else ''
    mode_func = (
        evaled_func
        # "\\?" yields the same runtime text as the original "\?" without the
        # invalid-escape warning.
        .replace(
            "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\'\')',1500);document.cookie=",
            'return')
        .replace(';if((function(){try{return !!window.addEventListener;}', '')
        .replace(
            "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            '')
        .replace(
            "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            '')
        .replace("return'__jsl_clearance", "var window={};return '__jsl_clearance")
        .replace(
            "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='" + aa + "';" + fn + "=" + fn + ".firstChild.href",
            "var " + fn + "='" + url + "'")
    )
    print('第二次替换后的js代码:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        # Best effort: report the script that failed and let the caller see
        # None. Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print('js执行错误:', mode_func)
        return None
# Second request: fetch the detail page with the computed cookies attached.
def con_spider(cookies, url):
    """Request ``url`` with ``cookies`` and print the page on success.

    Uses the module-level ``head`` as request headers.

    :param cookies: Cookies passed to ``requests.get``.
    :param url: Page to request.
    :return: The response object when the status is 200, otherwise ``None``.
    """
    response = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if response.status_code != 200:
        print('第二次爬取错误状态码:', response.status_code)
        return None
    # Decode the body as UTF-8 before printing it.
    response.encoding = 'utf-8'
    print(response.status_code)
    print(response.text)
    return response
if __name__ == "__main__":
    # First request yields the cookies; the second one fetches the page.
    cookies = get_521_content(url)
    con_spider(cookies, url)
代码三
# resource:https://www.cnblogs.com/gongs/p/10524710.html
import re
import urllib.error
import urllib.request

import execjs
import requests
# Detail page requested by this snippet.
url = 'http://www.mafengwo.cn/poi/5423409.html'
# Browser-like request headers sent to www.mafengwo.cn. The commented-out
# entries are optional headers that can be enabled when needed.
head = {
    # "*/*" restored: the pasted value contained a bare "**", which is not a
    # valid media range.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # "Cache-Control": "max-age=0",
    # "Connection": "keep-alive",
    # "cookie": "<Cookie header copied from your own browser session>",
    # "Host": "www.mafengwo.cn",
    # "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}
def getResponse():
    """Request the module-level ``url`` using the module-level ``head``.

    :return: The response object returned by ``requests.get``.
    """
    # ``url1`` is not defined anywhere in this file; ``url`` is the target
    # declared next to ``head``.
    response = requests.get(url, headers=head)
    return response
def getJslid(response):
    """Format the cookies of ``response`` as ``name=value; name2=value2``.

    :param response: Object whose ``cookies.items()`` yields
        ``(name, value)`` string pairs.
    :return: The pairs joined with ``"; "``; an empty string when there are
        no cookies.
    """
    return '; '.join('='.join(item) for item in response.cookies.items())
def getClearance(response):
    """Run the challenge script in ``response`` and return its cookie pair.

    The inline script is evaluated in two stages with execjs: the first call
    returns the generated script text, the second returns the cookie string
    that the script would have assigned to ``document.cookie``.

    :param response: Response object whose ``text`` holds the challenge page.
    :return: The part of the generated cookie string before the first ``;``.
    """
    # The pattern had been reduced to an empty string (always yielding '').
    # NOTE(review): the <script> pattern is reconstructed -- confirm.
    txt = ''.join(re.findall(r'<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    print(func_return)
    content = execjs.compile(func_return)
    print(type(content))
    # content = open("jsdom_document").read()
    # print(content)
    # execjs._exceptions.ProgramError: ReferenceError: document is not defined
    eval_func = content.call('x')
    # Name of the function declared by the generated script.
    name = re.findall(r'var (.*?)=function.*', eval_func)[0]
    # Drop statements that need a real browser and turn the cookie
    # assignment into a return value.
    mode_func = (
        eval_func
        .replace('while(window._phantom||window.__phantomas){};', '')
        .replace('document.cookie=', 'return')
        .replace('if((function(){try{return !!window.addEventListener;}', '')
        .replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, '')
        .replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '')
        .replace(
            r"setTimeout('location.href=location.pathname+location.search.replace(/[?|&]captcha-challenge/,'')',1500);",
            '')
    )
    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]
    return clearance
def structurecookie(cook, clearance):
    """Build a ``Cookie`` header value from the two cookie strings.

    :param cook: Cookie string from the first response (may be empty).
    :param clearance: ``name=value`` pair computed from the challenge script.
    :return: The non-empty parts joined with ``"; "``; the value is also
        printed.
    """
    # Use "; " as the pair separator (the same one getJslid produces) and
    # skip empty parts so the result never starts with a stray ';'.
    cookie = '; '.join(part for part in (cook, clearance) if part)
    print(cookie)
    return cookie
if __name__ == '__main__':
    # NOTE(review): the two values below are computed but not combined or
    # used afterwards in the visible code; structurecookie() looks like the
    # intended next step -- confirm.
    response = getResponse()
    clearance = getClearance(response)
    cook = getJslid(response)
# Browser-like request headers, including a Cookie header copied from a
# browser session.
# NOTE(review): the cookie below is a hard-coded, account-specific session
# value (it includes a fixed __jsl_clearance); it should be supplied at run
# time rather than committed. It is kept unchanged here.
head = {
    # Quote mismatch fixed (the value opened with ' and closed with ") and
    # "*/*" restored in place of the invalid bare "**".
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    "Host": "www.mafengwo.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}
# # Print the status code of the page request
# req = requests.get(url, headers=head).status_code
# print(req)
# ``url1`` is not defined anywhere in this file; ``url`` is used instead.
# NOTE(review): confirm this is the intended target.
request = urllib.request.Request(url, headers=head)
html = ""
try:
    response = urllib.request.urlopen(request)
    # Undecodable bytes are dropped rather than raising.
    html = response.read().decode(encoding="utf-8", errors="ignore")
    print(html)
except urllib.error.URLError as e:
    # Report whichever of the two attributes the error carries.
    if hasattr(e, "code"):
        print("状态码:%s"%(e.code))
    if hasattr(e, "reason"):
        print("原因:%s"%(e.reason))
# response = requests.get(url1)
# print(response)
# # cookie1
# cookie1 = response.cookies
# print(cookie1)
# # js代码
# js_code = response.text
# print(js_code)



