“超级鹰”验证码自动识别接口的客户端代码(使用前须先到超级鹰网站注册账号)。将下面的代码保存为 CodeClass.py(爬取脚本通过 from CodeClass import Chaojiying_Client 导入它),并与爬取脚本放在同一目录下。
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The client keeps the account credentials and software ID as form fields
    that are sent with every request, and exposes one method per endpoint.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and build the per-request form fields.

        Args:
            username: Chaojiying account name, sent as the ``user`` field.
            password: Account password as ``str`` (encoded as UTF-8) or as
                already-encoded ``bytes``. Only its MD5 hex digest is kept.
            soft_id: Software ID, sent as the ``softid`` field.
            timeout: Seconds to wait for the server before giving up; passed
                to every ``requests.post`` call. ``None`` waits forever.
        """
        self.username = username
        # The service receives the MD5 hex digest of the password ('pass2'),
        # never the plain text. hashlib needs bytes, so str is encoded first.
        if isinstance(password, str):
            password = password.encode("utf-8")
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields shared by every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image for recognition.

        Args:
            im: Raw bytes of the image.
            codetype: Captcha type code; see
                http://www.chaojiying.com/price.html

        Returns:
            The decoded JSON body of the server response.

        Raises:
            requests.exceptions.RequestException: On network failure, on
                timeout, or when the response body is not valid JSON
                (NOTE(review): the JSON case depends on the installed
                requests version -- confirm).
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        # Without a timeout requests would block indefinitely on a stalled
        # connection.
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha.

        Args:
            im_id: ID of the picture whose result was wrong.

        Returns:
            The decoded JSON body of the server response.

        Raises:
            requests.exceptions.RequestException: On network failure, on
                timeout, or when the response body is not valid JSON
                (NOTE(review): the JSON case depends on the installed
                requests version -- confirm).
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
爬取过程(主脚本,另存为单独的 .py 文件,并与 CodeClass.py 放在同一目录):
#编码流程:
#1.验证码的识别,获取验证码图片的文字数据
#2.对post请求进行发送
#3.对响应数据进行持久化存储
from CodeClass import Chaojiying_Client
import requests
from lxml import etree
def getCodeText(imgPath, codeType, softId=None):
    """Recognise the captcha image at *imgPath* and return its text.

    The image is uploaded once through ``Chaojiying_Client.PostPic``; the
    ``'pic_str'`` field of the response is printed and returned.

    Args:
        imgPath: Path of the local captcha image file.
        codeType: Captcha type code passed to ``PostPic`` (the caller in
            this file uses ``'1902'``).
        softId: Software ID for the Chaojiying account. When omitted,
            ``codeType`` is passed in its place, as before.
            NOTE(review): that looks like a mix-up of software ID and
            captcha type -- pass your real software ID here.

    Returns:
        The ``'pic_str'`` value of the recognition response.

    Raises:
        OSError: If the image file cannot be opened or read.
        KeyError: If the response has no ``'pic_str'`` field.
    """
    # TODO: fill in your own Chaojiying account name and password.
    chaojiying = Chaojiying_Client(
        '自己的账号',
        '账号对应的密码',
        codeType if softId is None else softId,
    )
    # Read the image inside a context manager so the handle is always closed.
    with open(imgPath, 'rb') as fp:
        im = fp.read()
    # Upload exactly once and reuse the result: a second PostPic call would
    # send the image again (presumably billed again) and could return a
    # different answer than the one printed.
    pic_str = chaojiying.PostPic(im, codeType)['pic_str']
    print(pic_str)
    return pic_str
# Create one session and use it for EVERY request below, so that cookies set
# while loading the login page and the captcha image are sent back with the
# login POST and with the profile request.
session = requests.Session()

# 1. Fetch the login page, then download the captcha image it references.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
url = 'http://www.renren.com/SysHome.do'
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Raises IndexError if the page contains no element with this id.
code_img_src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
# Fetched through the same session: a captcha requested without the
# session's cookies would presumably not match the one the server checks at
# login.
code_img_data = session.get(url=code_img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(code_img_data)

# Recognise the captcha text with the Chaojiying helper.
result = getCodeText('code.jpg', '1902')

# 2. Send the login POST (simulated login).
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2021041722428'
data = {
    'email': '自己的email',
    'icode': result,
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '自己的网页密码',
    'rkey': '2def5d84381e7889d5a3035d83561d72',
    'f': 'http%3A%2F%2Fsc.renren.com%2F',
}
response = session.post(url=login_url, headers=headers, data=data)
# 200 only shows that the HTTP request was answered; it does not by itself
# prove the login succeeded. Inspect response.text (for example by saving it
# to a file) to check the actual login result.
print(response.status_code)

# 3. Fetch the logged-in user's profile page and store it on disk.
# No manual 'cookie' header is needed: the session already carries the
# cookies received during login.
detail_url = 'http://www.renren.com/975729432/profile'
detail_page_text = session.get(url=detail_url, headers=headers).text
with open('zhuazei.html', 'w', encoding='utf-8') as fp:
    fp.write(detail_page_text)



