主要是构建链接,让程序一直获取网页,直到获取到除了输入数字以外的内容。这里提供两种方式,原理都差不多,主要都是用正则表达式和 requests 进行获取。小编用的是 Python 3 哦。
one.py
import requests
import re
from lxml import etree
def get_Html(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser User-Agent header and a
    timeout of 20 passed to ``requests.get``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response body decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=20)
    # Decode the raw bytes explicitly instead of relying on the encoding
    # that requests infers from the response headers.
    return response.content.decode('utf-8')
def next():
html = get_Html(url)
number = re.findall('.*?(d+)', html)
while number:
next_url = "http://www.heibanke.com/lesson/crawler_ex00/%s" % number[0]
print(next_url)
html = requests.get(next_url).content.decode('utf-8')
number = re.findall(r'.*?(d+).', html)
res = re.findall('
two.py
import requests
import re
import datetime
if __name__ == '__main__':
begin_time = datetime.datetime.now()
url = 'http://www.heibanke.com/lesson/crawler_ex00/'
new_url = url
num_re = re.compile(r'[^d<]*?(d+)[^d<]*?
- 更多代码详情请参考我的 GitHub



