【前言】最近在爬取360百科的数据时遇到了一些问题(页面弹出了滑块验证码),如下图所示:
于是查到可以用 selenium、PIL 等工具绕过验证码。代码主要参考了极验(Geetest)平台滑块验证的例子,并做了部分改动,完整代码附在文末。
由于360百科的验证码页面中直接展示的是滑块和缺口同时存在的验证码图,网页资源里也没有完整的原图,而且滑块本身带有黄色荧光边框,无法直接截取网页中的图片。于是我通过刷新滑块获取两张缺口位置不同的图(通过 src 下载),再用蒙版的方法得到了17张验证码原图。虽然过程很麻烦,好在问题解决了。然后将原图与截图逐像素比对,计算出需要滑动的距离,最后模拟人工滑动轨迹。
主要包括以下几个步骤:
1)批量搜集或下载所有验证码原图,保存到本地(没有缺口,没有滑块的原图)如下,以此为参照,计算出滑块需要滑动的距离(一共要用到3张图片(原图,带滑块,带缺口)才能计算出距离);
2)利用 selenium 进入滑块验证码页面,截取所需的页面图片,用于计算缺口距离;
3)机器模拟人工滑动轨迹;
代码实现部分:
#coding=gbk
import time
import PIL
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib3.packages.six import BytesIO
import requests as req
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably a pixel margin for the slider/gap computation - confirm.
BORDER = 6
# NOTE(review): also unreferenced in the visible code; presumably the initial
# left offset (in pixels) of the slider - confirm against the original source.
INIT_LEFT = 60
class CrackGeetest():
def __init__(self, browser):
    """Bind the solver to an existing WebDriver instance.

    The captcha URL is not stored here; it is passed to ``crack`` per call.

    :param browser: WebDriver used for every page interaction.
    """
    self.browser = browser
    # Explicit wait shared by element lookups; times out after 20 seconds.
    self.wait = WebDriverWait(browser, 20)
def crack(self, url):
    """Load the captcha page and retry the slider until an attempt succeeds.

    The original retried by calling itself recursively but discarded the
    recursive result, so any success after a first failure returned None.
    The retry is now a loop, which always returns the final URL and cannot
    exhaust the recursion limit.

    :param url: address of the captcha page to open.
    :return: the browser's current URL after a successful attempt.
    """
    while True:
        self.browser.get(url)
        slider = self.get_slider()
        # Captcha image downloaded from the page (contains the gap).
        image2 = self.getImage2('captcha2.png')
        # Stored reference image matching the captcha, or None if none matched.
        image1 = self.matchImg(image2)
        # NOTE(review): get_geetest_image / getborder are not visible in this
        # part of the file; presumably a page screenshot and the slider's own
        # offset inside it - confirm.
        image3 = self.get_geetest_image('screenshot.png')
        border = self.getborder(image3, image1)
        gap = self.get_gap(image1, image2)
        # Subtract the border offset from the gap position.
        distance = gap - border
        # Build the movement track, then adjust it against the target distance.
        track = self.get_track(distance)
        track = self.compTrackAdistance(self.getTrackSum(track), distance, track)
        succeeded = self.move_to_gap(slider, track)
        time.sleep(3)
        if succeeded:
            return self.browser.current_url
        # Failed attempt: start over from whatever page the browser is on now.
        url = self.browser.current_url
# Adjust `track` so that its steps add up to `distance`.
# NOTE(review): this definition is damaged in the file. The `elif` line below is
# not valid Python and is fused with the tail of a different routine: the lines
# after it assign and return `border` from an undefined `i`, and `crack` obtains
# `border` from `self.getborder`, so they presumably belong to that method.
# Restore the missing text from the original source; do not reconstruct it by guess.
def compTrackAdistance(self,tracksum,distance,track):
if tracksum > distance:
# Overshoot: shrink the last step by 1 until getTrackSum(track) equals distance.
while(self.getTrackSum(track)!= distance):
track[-1] = track[-1] - 1
return track
elif tracksum13:
border = i+5
return border
return border
def getImage2(self, name='captcha2.png'):
    """Fetch the captcha image shown on the page and return it resized.

    Reads the ``src`` of the element with class ``v-wrap-img-bg``, passes it
    to ``self.urllib_download``, then opens ``temp2.png``, resizes it to
    375x232 and saves the result under ``name``.

    :param name: file name the resized image is saved to.
    :return: the resized PIL image object.
    """
    # find_element(By.CLASS_NAME, ...) works on Selenium 3 and 4; the older
    # find_element_by_class_name helper was removed in Selenium 4.3.
    img = self.browser.find_element(By.CLASS_NAME, "v-wrap-img-bg")
    srcurl = img.get_attribute("src")
    # NOTE(review): urllib_download is not visible here; presumably it writes
    # the downloaded image to temp2.png - confirm.
    self.urllib_download(srcurl)
    # resize() returns an independent image, so the source file can be closed.
    with Image.open('temp2.png') as downloaded:
        # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
        resizeimg = downloaded.resize((375, 232), Image.LANCZOS)
    resizeimg.save(name)
    return resizeimg
def matchImg(self, img, image_dir="D:\\IMAGE\\", image_count=17):
    """Find the stored reference image that matches the captcha ``img``.

    Reference images are read from ``image_dir`` as ``1.png`` ..
    ``<image_count>.png``, each resized to 375x232. A candidate matches when
    the red channel at three fixed probe points differs from ``img`` by at
    most 5. The first match is saved to ``matchimg.png`` and returned.

    The original path literal ``"D:\\IMAGE\\"`` had unescaped backslashes, so
    the trailing one swallowed the closing quote and the file did not parse.

    :param img: captcha image to match (needs a ``load()`` pixel accessor).
    :param image_dir: directory prefix, including the trailing separator.
    :param image_count: number of numbered reference images to try.
    :return: the matching resized reference image, or None if none matches.
    """
    probe_points = ((77, 30), (55, 135), (125, 17))
    target_pixels = img.load()
    target_reds = [target_pixels[point][0] for point in probe_points]

    for index in range(1, image_count + 1):
        # Open lazily so a match returns before the remaining files are read.
        with Image.open(image_dir + str(index) + ".png") as reference:
            # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS.
            candidate = reference.resize((375, 232), Image.LANCZOS)
        candidate_pixels = candidate.load()
        if all(
            abs(red - candidate_pixels[point][0]) <= 5
            for point, red in zip(probe_points, target_reds)
        ):
            print("sucesss match!")
            candidate.save("matchimg.png")
            return candidate
    return None
def get_slider(self):
    """Wait until the slider handle is clickable and return it.

    :return: the element located by class name ``v-target-btn``.
    """
    locator = (By.CLASS_NAME, 'v-target-btn')
    return self.wait.until(EC.element_to_be_clickable(locator))
def get_gap(self, image1, image2):
    """Return the x offset at which the gap is detected.

    Scans columns 1..374 and rows 1..231, keeping a running count of pixels
    that ``is_pixel_equal`` reports as different. The count is cumulative
    across columns; the column where it first exceeds 13 is returned.

    :param image1: reference image without the gap.
    :param image2: captcha image with the gap.
    :return: detected column, or 80 when either image is missing or the
        count never exceeds 13.
    """
    fallback_offset = 80
    if not (image1 and image2):
        return fallback_offset

    mismatches = 0
    for column in range(1, 375):
        for row in range(1, 232):
            if self.is_pixel_equal(image1, image2, column, row):
                continue
            mismatches += 1
            if mismatches > 13:
                return column
    return fallback_offset
def is_pixel_equal(self, image1, image2, x, y):
    """Report whether two images hold nearly the same colour at (x, y).

    :param image1: first image.
    :param image2: second image; a falsy value yields False.
    :param x: x coordinate of the pixel.
    :param y: y coordinate of the pixel.
    :return: True when the first three channels each differ by less than 40,
        otherwise False.
    """
    if not image2:
        return False
    tolerance = 40
    # Read the pixel at (x, y) from each image.
    first = image1.load()[x, y]
    second = image2.load()[x, y]
    return all(abs(first[channel] - second[channel]) < tolerance for channel in range(3))
def get_track(self, distance):
'''
根据偏移量获取移动轨迹
:param distance: 偏移量
:return: 移动轨迹
'''
# 移动轨迹
track = []
# 当前位移
current = 0
# 减速阈值
mid1 = distance * 2 / 3
mid2 = distance * 3 / 4
# 计算间隔
t = 0.3
# 初速度
v = 10
while current < distance:
if current < mid1:
# 加速度为正2
a = 2
elif current 


