栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

黑板客爬虫闯关

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

黑板客爬虫闯关

  1. 第四关增加了登录验证和密码获取,主页面如下:


2. 但是想要进入主页面需要先登录:


3. 进去后可以随便试下密码,果然错误,然后系统提供了一个密码表

  1. 密码表
  2. 然后就是繁杂的解密过程了,流程和之前其实差不多,同样的模拟登陆,不同的是这里增加了密码获取这一条,开始做的时候也是很懵逼,后来在网上大佬的提点下知道密码是有100位QAQ,而且页面加载极其之慢,所以这个过程非常煎熬
    关卡主要是想考察模拟登陆和多线程爬虫这一块,这样比较快嘛,但是也是可以暴力破解的嘛,下面提供三个版本参考
    one.py(单线程,直接获取密码组合到100位再进行测试)
import requests
from lxml import etree
import codecs
import csv
import re


# Shared requests session so cookies (login state, CSRF token) persist
# across all calls made by the HBK class below.
se = requests.session()


# Browser-like User-Agent so the site does not reject scripted requests.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
}

class HBK():
    """Solver for heibanke crawler challenge #4 (single-threaded version).

    Logs in, repeatedly scrapes page 1 of the random password table until
    all 100 positions have been seen, then submits the assembled password.
    Relies on the module-level session ``se`` and ``headers`` dict.
    """

    def __init__(self):
        self.login_url = "http://www.heibanke.com/accounts/login"
        self.username = "whaike"
        self.password = "12345654321"
        # Index 0 is unused: the site reports 1-based password positions.
        self.passwrods = ['' for i in range(101)]
        self.pwd = ''

    def getCsrf(self):
        """Fetch the login page and store its CSRF token in ``self.csrf``."""
        res = se.get(url=self.login_url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        self.csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]

    def login(self):
        """Log in with the fixed account; the session keeps the auth cookie."""
        self.getCsrf()
        data = {
            "csrfmiddlewaretoken": self.csrf,
            "username": self.username,
            "password": self.password
        }
        se.post(url=self.login_url, headers=headers, data=data, timeout=30)
        print('登陆成功')

    def getNCsrf(self):
        """Return the CSRF token embedded in the challenge page form."""
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        res = se.get(url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        csrf = tree.xpath('//input[1]/@value')[0]
        return csrf

    def guesspwd(self):
        """Submit the current candidate password; True when the POST succeeds.

        On success the page heading(s) are stored in ``self.h3``.
        """
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        csrf = self.getNCsrf()
        data = {
            "csrfmiddlewaretoken": csrf,
            "username": "whaike",
            "password": self.pwd
        }
        res = se.post(url, headers=headers, data=data, timeout=30)
        if int(res.status_code) == 200:
            # NOTE(review): pattern reconstructed — the scraped source had the
            # HTML tags stripped ('(.*?)' matched nothing useful); the attribute
            # name strongly suggests the original matched <h3> headings.
            self.h3 = re.findall(r'<h3>(.*?)</h3>', res.text)
            return True
        else:
            return False

    def getGasswords(self):
        """Scrape page 1 of the password table until all 100 slots are filled.

        Rewritten from unbounded recursion to a loop (the page loads slowly,
        so the recursion depth could grow without limit).  Each pass merges
        the positions shown on page 1 into ``self.passwrods`` and tries a
        guess once 100 characters have been collected.
        """
        url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1'
        while True:
            print('获取第一页')
            res = se.get(url, headers=headers, timeout=30).text
            tree = etree.HTML(res)
            trs = tree.xpath('/html/body/div/div/div[2]/table/tr')[1:]
            for tr in trs:
                p1 = tr.xpath('td[1]/text()')[0]  # position cell
                # Extract the digits only (fixes the original 'd+' pattern,
                # which matched literal 'd' characters instead of digits and
                # would raise IndexError on a normal position value).
                p = int(re.findall(r'\d+', p1)[0])
                w = tr.xpath('td[2]/text()')[0]  # value cell
                self.passwrods[p] = w
            self.pwd = ''.join(self.passwrods)
            length = len(self.pwd)  # how many characters are known so far
            print('当前密码:%s,长度%d' % (self.pwd, length))
            if length == 100:
                print('满足条件,开始猜测...')
                if self.guesspwd():
                    print('猜测成功,密码为:%s' % self.pwd)
                    return
                print('猜测失败,继续执行')


if __name__ == '__main__':

    # Challenge 4: log in first, then collect the 100-char password and guess.
    print('开始闯关 - 第四关')
    spider = HBK()
    spider.login()
    spider.getGasswords()
    # h3 holds the page heading scraped after a successful guess.
    print(spider.h3)

two.py(多线程版,转自网上某大佬)

  #!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-09-02 22:25:21
# @Author  : bb (317716008@qq.com)
# @Word    : python can change world!
# @Version : python3.6
import requests
from bs4 import BeautifulSoup
import threading
from queue import Queue


# Shared state used by the worker threads below.
dict1={}    # position (str) -> password character
vlauess=[]  # password characters ordered by position, filled when complete
web1="http://www.heibanke.com/accounts/login"
web2="http://www.heibanke.com/lesson/crawler_ex03/pw_list/"
web3="http://www.heibanke.com/lesson/crawler_ex03/"
# NOTE(review): 'global' at module level is a no-op; kept for fidelity.
global queuewz
global queuemm
queuewz=Queue()  # scraped password positions
queuemm=Queue()  # scraped password values


class mythreads(threading.Thread):
    """Worker thread: scrape one page of the password table, merge the
    scraped (position, value) pairs into the shared dict, and attempt the
    login once 100 distinct positions are known.

    NOTE(review): this script never imports ``re`` itself — the findall
    calls below only work because ``re`` is in scope at file level; confirm
    before running two.py standalone.
    NOTE(review): dict1/vlauess/queues are shared across threads without a
    lock; the code relies on Queue's thread-safety and CPython's GIL.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Fetch and enqueue one page's worth of positions/values first.
        work()
        while not queuemm.empty():
            try:
                # Pair up one position with one value (queues are filled in
                # the same order by get_dict).
                dict1[str(queuewz.get())]=queuemm.get()
                print(dict1)
                print("字典长度为%s"%len(dict1))
                if int(len(dict1)) ==100:
                    print("凑到100啦!")
                    # Assemble the password in position order (1..100).
                    for i in range(1,101):
                        vlauess.append(dict1[str(i)])
                    c=vlauess[:100]
                    zzmm=''.join(c)
                    print("密码为%s"%zzmm)
                    print("正在登录.......")
                    dataWebsite1 = {'username': 'user','password': zzmm}
                    s=login_get()  # shadows the module-level session on purpose?  TODO confirm
                    res=s.post(web3, data=dataWebsite1).text
                    if u'恭喜' in res:
                        # NOTE(review): these patterns look scrape-damaged —
                        # the HTML tags around '(.*?)' were stripped, so as
                        # written each findall matches empty strings.  Likewise
                        # 'n'.join was presumably '\n'.join, and the 下一关
                        # pattern probably captured an <a href> URL.
                        title=re.findall("(.*?)",res)
                        word=re.findall("(.*?)",res)
                        word2=re.findall("(.*?)",res)
                        html=re.findall('下一关',res)
                        print('n'.join([title[0], word[0], word2[0],'下一关地址是','http://www.heibanke.com'+html[0]]))
                        break
                    else:
                        print("网页有问题哦!可以尝试手动将获得的正确密码登入进去哦!")
                        break
                else:
                    # Fewer than 100 known: re-run main(), which spawns a
                    # fresh batch of threads to scrape another page.
                    # NOTE(review): this recursion from inside a worker thread
                    # is unbounded and re-logs-in each round.
                    main()
            except IndexError:
                print("例表空了,下一页!")


def login_get():
    """Log in to heibanke and return the (hopefully authenticated) session.

    Best-effort, matching the original behaviour: if the login page does not
    set a ``csrftoken`` cookie, the login POST is skipped and the anonymous
    session is returned.

    Returns:
        requests.Session: session carrying whatever cookies were obtained.
    """
    s = requests.Session()
    r = s.get(web1)  # load the login page so the server sets the CSRF cookie
    try:
        # Only the cookie lookup can raise KeyError — keep the try minimal.
        token1 = r.cookies['csrftoken']
        # The token must be echoed back as the csrfmiddlewaretoken field.
        dataWebsite1 = {'username': 'user',
                        'password': 'password',
                        'csrfmiddlewaretoken': token1}
        s.post(web1, data=dataWebsite1)  # response intentionally unused
    except KeyError:
        # No csrftoken cookie: cannot log in; caller gets a plain session.
        pass

    return s

def get_html(s):
    """Download the password-list page (``web2``) with session *s* and
    return its HTML body as text."""
    return s.get(web2).text

def get_dict(res):
    """Parse the password-table HTML in *res* and enqueue its contents.

    Position cells (title="password_pos") go into ``queuewz``; value cells
    (title="password_val") go into ``queuemm``, in document order.
    """
    soup = BeautifulSoup(res, "html.parser")
    for cell in soup.find_all('td', attrs={'title': 'password_pos'}):
        queuewz.put(cell.string)
    for cell in soup.find_all('td', attrs={'title': 'password_val'}):
        queuemm.put(cell.string)

def work():
    """One scrape pass: fetch the password page with the shared session
    ``s`` and enqueue its positions and values."""
    get_dict(get_html(s))


def main():
    """Log in once (shared session ``s``), then run ten scraper threads
    and wait for all of them to finish."""
    global s
    s = login_get()
    thread_count = 10

    workers = [mythreads() for _ in range(thread_count)]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()


if __name__ == '__main__':
    # Entry point: logs in and spawns the scraper threads (network-bound).
    main()

three.py

import re
import requests
from threading import Thread
import time

def print_run_time(func):
    """Decorator that prints how long the wrapped call took.

    Fixes two defects in the original: the wrapper accepted ``*args``/``**kw``
    but never forwarded them to ``func``, and it discarded ``func``'s return
    value.  Both are now passed through, so decorated methods behave exactly
    like the undecorated ones plus a timing printout.
    """
    def wrapper(self, *args, **kw):
        local_time = time.time()
        result = func(self, *args, **kw)  # forward every argument
        print('run time is {:.2f}:'.format(time.time() - local_time))
        return result  # propagate the wrapped function's result
    return wrapper

class hbk_crawler(object):
    """Heibanke crawler challenge solver (challenge 4: 100-char password,
    login required, CSRF-protected, deliberately slow responses)."""

    def __init__(self): pass

    def login(self):
        """Log in to the challenge site.

        Sets ``self.url`` (challenge page), ``self.login_url``, ``self.s``
        (requests session) and ``self.payload`` (credentials plus the
        csrfmiddlewaretoken refreshed by the login POST).
        """
        self.url = 'http://www.heibanke.com/lesson/crawler_ex03'
        self.login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03'
        self.s = requests.session()
        print("正在登录第4关....")
        try:
            self.csrftoken = self.s.get(self.login_url).cookies['csrftoken']
        except:
            # NOTE(review): bare except hides the real failure (DNS error,
            # timeout, missing cookie) — narrowing it would aid debugging.
            print("网络连接错误,请重试...")
            exit()
        self.payload = {'username': 'test', 'password': 'test123',
                        'csrfmiddlewaretoken': self.csrftoken}
        # The login POST rotates the CSRF token; keep the fresh one for the
        # final password submission in ex04().
        self.payload['csrfmiddlewaretoken'] = self.s.post(
            self.login_url, self.payload).cookies['csrftoken']
        print("登录成功....")
        return None

    def parseurl(self, url):
        """Worker loop: fetch *url* repeatedly until 100 positions are known.

        Fills ``self.pw_dict`` (position string -> value) and increments
        ``self.count`` for each new position.
        NOTE(review): count/pw_dict are shared by multiple threads with no
        lock; the code relies on CPython's GIL keeping these updates benign.
        """
        while self.count < 100:
            response = self.s.get(url)
            if response.ok:
                content = response.text
                # NOTE(review): these patterns look scrape-damaged — the HTML
                # markup around `_pos`/`_val` was partially stripped from the
                # article; verify against the live page before relying on them.
                pos_pattern = r'_pos.>(.*)'
                val_pattern = r'_val.>(.*)'
                pos_list = re.findall(pos_pattern, content)
                val_list = re.findall(val_pattern, content)
                for pos, val in zip(pos_list, val_list):
                    if pos not in self.pw_dict:
                        self.pw_dict[pos] = val
                        self.count = self.count + 1
                # Crude progress display: count, '%', one '*' per 2 found.
                print(str(self.count) + '%' + self.count // 2 * '*')

    def ex04(self, *args, **kw):
        """Challenge 4: login + CSRF + 100-char password + slow responses."""
        self.count = 0
        self.login()
        self.pw_dict = {}
        # 1-tuple; passed as `args=(pw_url)` below, which equals pw_url
        # itself, so each thread receives the single URL string.
        pw_url = ('http://www.heibanke.com/lesson/crawler_ex03/pw_list',)
        # Thread count: the server answers at most 2 requests per 15 s,
        # otherwise it returns 404.
        n = 2
        threads = [Thread(target=self.parseurl, args=(
            pw_url)) for i in range(n)]
        for t in threads:
            print(t.name, 'start...')
            t.start()
        for t in threads:
            t.join()
        # Index 0 unused: the site reports 1-based positions.
        self.pw_list = ['' for n in range(101)]
        for pos in self.pw_dict.keys():
            self.pw_list[int(pos)] = self.pw_dict[pos]
        # NOTE(review): int() assumes an all-digit password — it drops any
        # leading zero and raises ValueError on non-digit characters.
        password = int(''.join(self.pw_list))
        self.payload['password'] = password
        response = self.s.post(self.url, self.payload)
        # NOTE(review): pattern likely lost its HTML tags when the article
        # was scraped; as written it matches every line of the response.
        pattern = r'(.*)'
        result = re.findall(pattern, response.text)
        result2 = re.findall('下一关',response.text)
        print(result[0])
        print(result2)


if __name__ == '__main__':
    # Run challenge 4 end-to-end (network-bound; prints the result pages).
    Hbk_crawler = hbk_crawler()
    Hbk_crawler.ex04()

    综合总结下来,几种方法原理都差不多,主要区别在于多线程在数据获取上速度会快一些。关于多线程的板块后面会进行更新,大家也可以去看看官方文档或者廖雪峰的教程。这里每次都要模拟登录,比较麻烦,注意代码不要冗余。在获取错误信息方面几种方法基本都差不多,用的是 re 抓取;如果 "text" in XXX 的方式不适用,可以尝试把 result 构建为抓取到的错误信息:返回值为空则说明密码正确,有返回值则说明存在错误信息。页面加载比较慢,建议多给点输出信息,以免你误以为代码挂了。

  • 更多代码详情参考我的Github
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/221080.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号